17#include <hip/hip_runtime.h>
19#include "arcane/utils/PlatformUtils.h"
20#include "arcane/utils/NotSupportedException.h"
21#include "arcane/utils/Real3.h"
24#include "arcane/core/Item.h"
27#include "arcane/accelerator/hip/HipAccelerator.h"
28#include "arcane/accelerator/Runner.h"
29#include "arcane/accelerator/RunQueue.h"
31#include "arcane/accelerator/NumArray.h"
37__device__ __forceinline__
unsigned int getGlobalIdx_1D_1D()
39 unsigned int blockId = blockIdx.x;
40 unsigned int threadId = blockId * blockDim.x + threadIdx.x;
48 using reference_type = value_type&;
51 ARCCORE_HOST_DEVICE Privatizer(
const T& o) : priv{o} {}
52 ARCCORE_HOST_DEVICE reference_type get_priv() {
return priv; }
56ARCCORE_HOST_DEVICE
auto thread_privatize(
const T& item) ->
Privatizer<T>
61__global__
void MyVecAdd(
double* a,
double* b,
double* out)
63 int i = blockDim.x * blockIdx.x + threadIdx.x;
72 Int64 size = a.size();
73 Int64 i = blockDim.x * blockIdx.x + threadIdx.x;
84 Int32 size =
static_cast<Int32
>(a.extent0());
85 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
96 Int64 vsize = a.size();
97 for( Int64 i = 0; i<vsize; ++i ){
98 a[i] = (double)(i+base);
99 b[i] = (double)(i*i+base);
106 Int32 vsize =
static_cast<Int32
>(a.extent0());
107 for( Int32 i = 0; i<vsize; ++i ){
108 a(i) = (double)(i+base);
109 b(i) = (double)(i*i+base);
114template<
typename F> __global__
115void MyVecLambda(
int size,F func)
117 auto privatizer = thread_privatize(func);
118 auto& body = privatizer.get_priv();
120 int i = blockDim.x * blockIdx.x + threadIdx.x;
129 virtual __device__ __host__
void DoIt2() =0;
137 virtual __device__ __host__
void DoIt2()
override {}
147 auto k = [=](Context1& ctx){ std::cout <<
"A=" << ctx.a <<
"\n"; };
156 constexpr int vsize = 2000;
157 std::vector<double> a(vsize);
158 std::vector<double> b(vsize);
159 std::vector<double> out(vsize);
160 for(
size_t i = 0; i<vsize; ++i ){
161 a[i] = (double)(i+1);
162 b[i] = (double)(i*i+1);
165 size_t mem_size = vsize*
sizeof(double);
166 double* d_a =
nullptr;
167 ARCANE_CHECK_HIP(hipMalloc(&d_a,mem_size));
168 double* d_b =
nullptr;
169 ARCANE_CHECK_HIP(hipMalloc(&d_b,mem_size));
170 double* d_out =
nullptr;
171 ARCANE_CHECK_HIP(hipMalloc(&d_out,mem_size));
173 ARCANE_CHECK_HIP(hipMemcpy(d_a, a.data(), mem_size, hipMemcpyHostToDevice));
174 ARCANE_CHECK_HIP(hipMemcpy(d_b, b.data(), mem_size, hipMemcpyHostToDevice));
175 int threadsPerBlock = 256;
176 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
177 std::cout <<
"CALLING kernel tpb=" << threadsPerBlock <<
" bpg=" << blocksPerGrid <<
"\n";
178 hipLaunchKernelGGL(MyVecAdd, blocksPerGrid, threadsPerBlock , 0, 0, d_a,d_b,d_out);
179 ARCANE_CHECK_HIP(hipDeviceSynchronize());
180 ARCANE_CHECK_HIP(hipMemcpy(out.data(), d_out, mem_size, hipMemcpyDeviceToHost));
181 for(
size_t i=0; i<10; ++i )
182 std::cout <<
"V=" << out[i] <<
"\n";
190 constexpr int vsize = 2000;
191 size_t mem_size = vsize*
sizeof(double);
192 double* d_a =
nullptr;
193 ARCANE_CHECK_HIP(hipMallocManaged(&d_a,mem_size,hipMemAttachGlobal));
194 double* d_b =
nullptr;
195 ARCANE_CHECK_HIP(hipMallocManaged(&d_b,mem_size,hipMemAttachGlobal));
196 double* d_out =
nullptr;
197 ARCANE_CHECK_HIP(hipMallocManaged(&d_out,mem_size,hipMemAttachGlobal));
203 for(
size_t i = 0; i<vsize; ++i ){
204 d_a[i] = (double)(i+1);
205 d_b[i] = (double)(i*i+1);
212 int threadsPerBlock = 256;
213 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
214 std::cout <<
"CALLING kernel2 tpb=" << threadsPerBlock <<
" bpg=" << blocksPerGrid <<
"\n";
215 hipLaunchKernelGGL(MyVecAdd, blocksPerGrid, threadsPerBlock, 0, 0, d_a,d_b,d_out);
216 ARCANE_CHECK_HIP(hipDeviceSynchronize());
217 hipError_t e = hipGetLastError();
218 std::cout <<
"END OF MYVEC1 e=" << e <<
" v=" << hipGetErrorString(e) <<
"\n";
225 for(
size_t i=0; i<10; ++i )
226 std::cout <<
"V=" << d_out[i] <<
"\n";
234extern "C" int arcaneTestHip3()
236 std::cout <<
"TEST_HIP_3\n";
237 constexpr int vsize = 2000;
238 IMemoryAllocator* hip_allocator = Arcane::Accelerator::Hip::getHipMemoryAllocator();
241 ARCANE_FATAL(
"platform::getAcceleratorHostMemoryAllocator() is null");
247 for (
size_t i = 0; i < vsize; ++i) {
248 d_a[i] = (double)(i + 1);
249 d_b[i] = (double)(i * i + 1);
253 int threadsPerBlock = 256;
254 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
255 std::cout <<
"CALLING kernel2 tpb=" << threadsPerBlock <<
" bpg=" << blocksPerGrid <<
"\n";
256 hipLaunchKernelGGL(MyVecAdd2, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
257 ARCANE_CHECK_HIP(hipDeviceSynchronize());
258 hipError_t e = hipGetLastError();
259 std::cout <<
"END OF MYVEC1 e=" << e <<
" v=" << hipGetErrorString(e) <<
"\n";
260 for (
size_t i = 0; i < 10; ++i)
261 std::cout <<
"V=" << d_out[i] <<
"\n";
265 _initArrays(d_a, d_b, d_out, 2);
267 dim3 dimGrid(threadsPerBlock, 1, 1), dimBlock(blocksPerGrid, 1, 1);
273 void* kernelArgs[] = {
280 ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
281 ARCANE_CHECK_HIP(hipLaunchKernel((
void*)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
282 ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
283 for (
size_t i = 0; i < 10; ++i)
284 std::cout <<
"V2=" << d_out[i] <<
"\n";
289 _initArrays(d_a, d_b, d_out, 3);
293 auto func = [=] ARCCORE_HOST_DEVICE(
int i) {
294 d_out_span[i] = d_a_span[i] + d_b_span[i];
299 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func);
300 ARCANE_CHECK_HIP(hipDeviceSynchronize());
301 for (
size_t i = 0; i < 10; ++i)
302 std::cout <<
"V3=" << d_out[i] <<
"\n";
304 _initArrays(d_a, d_b, d_out, 4);
307 for (
int i = 0; i < vsize; ++i)
309 for (
size_t i = 0; i < 10; ++i)
310 std::cout <<
"V4=" << d_out[i] <<
"\n";
317 for (
Integer i = 0; i < vsize; ++i) {
318 Real a = (Real)(i + 2);
319 Real b = (Real)(i * i + 3);
321 d_a3[i] =
Real3(a, a + 1.0, a + 2.0);
322 d_b3[i] =
Real3(b, b + 2.0, b + 3.0);
328 auto func2 = [=] ARCCORE_HOST_DEVICE(
int i) {
329 d_out_span[i] =
math::dot(d_a3_span[i], d_b3_span[i]);
334 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func2);
335 ARCANE_CHECK_HIP(hipDeviceSynchronize());
336 std::cout <<
"TEST WITH REAL3\n";
345extern "C" int arcaneTestHipNumArray()
347 std::cout <<
"TEST_HIP_NUM_ARRAY\n";
348 constexpr int vsize = 2000;
352 ARCANE_FATAL(
"platform::getAcceleratorHostMemoryAllocator() is null");
360 for (
int i = 0; i < vsize; ++i) {
361 d_a(i) = (double)(i + 1);
362 d_b(i) = (double)(i * i + 1);
366 int threadsPerBlock = 256;
367 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
368 std::cout <<
"CALLING kernel2 tpb=" << threadsPerBlock <<
" bpg=" << blocksPerGrid <<
"\n";
369 hipLaunchKernelGGL(MyVecAdd3, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
370 ARCANE_CHECK_HIP(hipDeviceSynchronize());
371 hipError_t e = hipGetLastError();
372 std::cout <<
"END OF MYVEC1 e=" << e <<
" v=" << hipGetErrorString(e) <<
"\n";
373 for (
int i = 0; i < 10; ++i)
374 std::cout <<
"V=" << d_out(i) <<
"\n";
378 _initArrays(d_a, d_b, d_out, 2);
380 dim3 dimGrid(threadsPerBlock, 1, 1), dimBlock(blocksPerGrid, 1, 1);
386 void* kernelArgs[] = {
393 ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
394 ARCANE_CHECK_HIP(hipLaunchKernel((
void*)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
395 ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
396 for (
int i = 0; i < 10; ++i)
397 std::cout <<
"V2=" << d_out(i) <<
"\n";
402 _initArrays(d_a, d_b, d_out, 3);
406 auto func = [=] ARCCORE_HOST_DEVICE(
int i) {
407 d_out_span(i) = d_a_span(i) + d_b_span(i);
412 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func);
413 ARCANE_CHECK_HIP(hipDeviceSynchronize());
414 for (
int i = 0; i < 10; ++i)
415 std::cout <<
"V3=" << d_out(i) <<
"\n";
417 _initArrays(d_a, d_b, d_out, 4);
420 for (
int i = 0; i < vsize; ++i)
422 for (
int i = 0; i < 10; ++i)
423 std::cout <<
"V4=" << d_out(i) <<
"\n";
430 for (
Integer i = 0; i < vsize; ++i) {
431 Real a = (Real)(i + 2);
432 Real b = (Real)(i * i + 3);
435 d_a3(i, 1) = a + 1.0;
436 d_a3(i, 2) = a + 2.0;
439 d_b3(i, 1) = b + 1.0;
440 d_b3(i, 2) = b + 2.0;
446 auto func2 = [=] ARCCORE_HOST_DEVICE(
int i) {
447 Real3 xa(d_a3_span(i, 0), d_a3_span(i, 1), d_a3_span(i, 2));
448 Real3 xb(d_b3_span(i, 0), d_b3_span(i, 1), d_b3_span(i, 2));
454 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func2);
455 ARCANE_CHECK_HIP(hipDeviceSynchronize());
456 std::cout <<
"TEST WITH REAL3\n";
472 std::cout <<
"TestReduction vsize=" << vsize <<
"\n";
477 for (
Integer i = 0; i < vsize; ++i) {
478 int a = 5 + ((i + 2) % 43);
495 double vxa = (double)(xa[i]);
497 sum_reducer.add(xa[i]);
498 sum_double_reducer.add(vxa);
499 max_int_reducer.max(xa[i]);
500 max_double_reducer.max(vxa);
501 min_int_reducer.min(xa[i]);
502 min_double_reducer.min(vxa);
507 int sum_int_value = sum_reducer.reduce();
508 double sum_double_value = sum_double_reducer.reduce();
509 std::cout <<
"SumReducer name=" << name <<
" v_int=" << sum_int_value
510 <<
" v_double=" << sum_double_value
512 int max_int_value = max_int_reducer.reduce();
513 double max_double_value = max_double_reducer.reduce();
514 std::cout <<
"MaxReducer name=" << name <<
" v_int=" << max_int_value
515 <<
" v_double=" << max_double_value
517 int min_int_value = min_int_reducer.reduce();
518 double min_double_value = min_double_reducer.reduce();
519 std::cout <<
"MinReducer name=" << name <<
" v_int=" << min_int_value
520 <<
" v_double=" << min_double_value
526__device__
int my_add(
int a,
int b) {
530__device__
int mul(
int a,
int b) {
537__global__
void compute(
int *d_result,
int N,
int (*op_func)(
int,
int)) {
538 int idx = threadIdx.x + blockIdx.x * blockDim.x;
543 d_result[idx] = op_func(idx, idx);
546using BinaryFuncType = int (*)(
int a,
int b);
548__device__ int (*my_func_ptr)(
int a,
int b) = my_add;
550__global__
void kernelSetFunction(BinaryFuncType* func_ptr)
558 static __device__
int doFunc(
int a,
int b)
569 virtual ARCCORE_HOST_DEVICE
int apply(
int a,
int b) =0;
576 ARCCORE_HOST_DEVICE
int apply(
int a,
int b)
override {
return a+b;}
579__global__
void compute_virtual(
int* d_result,
int N,
FooBase* ptr)
584 int idx = threadIdx.x + blockIdx.x * blockDim.x;
589 d_result[idx] = ptr->apply(idx, idx);
593__global__
void createFooDerived(
FooDerived* ptr)
595 int idx = threadIdx.x + blockIdx.x * blockDim.x;
602int arcaneTestVirtualFunction()
604 std::cout <<
"Test function pointer\n";
612 ARCANE_CHECK_HIP(hipMalloc(&foo_derived,
sizeof(
FooDerived)));
613 createFooDerived<<<1, 1>>>(foo_derived);
614 ARCANE_CHECK_HIP(hipDeviceSynchronize());
616 ARCANE_CHECK_HIP(hipMalloc(&d_result, N *
sizeof(
int)));
618 int (*host_func)(int, int) =
nullptr;
619 ARCANE_CHECK_HIP(hipMalloc(&host_func,
sizeof(
void*) * 8));
623 std::cout <<
"Set function pointer\n";
626 std::cout <<
"Wait end\n";
628 ARCANE_CHECK_HIP(hipDeviceSynchronize());
630 std::cout <<
"Calling compute\n";
636 compute_virtual<<<1, N>>>(d_result, N, foo_derived);
637 ARCANE_CHECK_HIP(hipDeviceSynchronize());
640 ARCANE_CHECK_HIP(hipMemcpy(h_result, d_result, N *
sizeof(
int), hipMemcpyDeviceToHost));
642 for (
int i = 0; i < N; ++i) {
643 printf(
"%d ", h_result[i]);
647 ARCANE_CHECK_HIP(hipFree(d_result));
654extern "C" int arcaneTestHipReduction()
657 std::cout <<
"Test Reductions\n";
664 int sizes_to_test[] = { 56, 567, 4389, 452182 };
665 for (
int i = 0; i < 4; ++i) {
666 int vsize = sizes_to_test[i];
667 arcaneTestHipReductionX(vsize, queue1,
"Sequential");
668 arcaneTestHipReductionX(vsize, queue2,
"Thread");
669 arcaneTestHipReductionX(vsize, queue3,
"HIP");
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Fonctions mathématiques diverses.
Fonctions de gestion mémoire et des allocateurs.
Types et fonctions pour gérer les synchronisations sur les accélérateurs.
Types et macros pour gérer les boucles sur les accélérateurs.
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
Boucle sur accélérateur avec arguments supplémentaires pour les réductions.
Classe pour effectuer une réduction 'max'.
Classe pour effectuer une réduction 'min'.
Classe pour effectuer une réduction 'somme'.
Gestion d'une commande sur accélérateur.
File d'exécution pour un accélérateur.
Gestionnaire d'exécution pour accélérateur.
Interface d'un allocateur pour la mémoire.
Classe de base des vues multi-dimensionnelles.
Tableaux multi-dimensionnels pour les types numériques accessibles sur accélérateurs.
MDSpanType span()
Vue multi-dimension sur l'instance.
MDSpanType mdspan()
Vue multi-dimension sur l'instance.
ConstMDSpanType constMDSpan() const
Vue constante multi-dimension sur l'instance.
void resize(Int32 dim1_size)
Modifie la taille du tableau en ne gardant pas les valeurs actuelles.
Classe gérant un vecteur de réel de dimension 3.
Vue d'un tableau d'éléments de type T.
Chaîne de caractères unicode.
Vecteur 1D de données avec sémantique par valeur (style STL).
__host__ __device__ Real dot(Real2 u, Real2 v)
Produit scalaire de u par v dans ℝ².
Espace de nom pour l'utilisation des accélérateurs.
RunCommand makeCommand(const RunQueue &run_queue)
Crée une commande associée à la file run_queue.
RunQueue makeQueue(const Runner &runner)
Crée une file associée à runner.
@ HIP
Politique d'exécution utilisant l'environnement HIP.
@ Sequential
Politique d'exécution séquentielle.
@ Thread
Politique d'exécution multi-thread.
IMemoryAllocator * getDefaultDataAllocator()
Allocateur par défaut pour les données.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
Int32 Integer
Type représentant un entier.
Espace de nom de Arccore.