17#include <hip/hip_runtime.h>
19#include "arcane/utils/PlatformUtils.h"
20#include "arcane/utils/NotSupportedException.h"
21#include "arcane/utils/Real3.h"
22#include "arcane/Item.h"
23#include "arcane/MathUtils.h"
25#include "arcane/accelerator/hip/HipAccelerator.h"
26#include "arcane/accelerator/Runner.h"
27#include "arcane/accelerator/RunQueue.h"
29#include "arcane/accelerator/NumArray.h"
35__device__ __forceinline__
unsigned int getGlobalIdx_1D_1D()
37 unsigned int blockId = blockIdx.x;
38 unsigned int threadId = blockId * blockDim.x + threadIdx.x;
46 using reference_type = value_type&;
49 ARCCORE_HOST_DEVICE Privatizer(
const T& o) : priv{o} {}
50 ARCCORE_HOST_DEVICE reference_type get_priv() {
return priv; }
54ARCCORE_HOST_DEVICE
auto thread_privatize(
const T& item) ->
Privatizer<T>
59__global__
void MyVecAdd(
double* a,
double* b,
double* out)
61 int i = blockDim.x * blockIdx.x + threadIdx.x;
70 Int64 size = a.size();
71 Int64 i = blockDim.x * blockIdx.x + threadIdx.x;
82 Int32 size =
static_cast<Int32
>(a.extent0());
83 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
94 Int64 vsize = a.size();
95 for( Int64 i = 0; i<vsize; ++i ){
96 a[i] = (double)(i+base);
97 b[i] = (double)(i*i+base);
104 Int32 vsize =
static_cast<Int32
>(a.extent0());
105 for( Int32 i = 0; i<vsize; ++i ){
106 a(i) = (double)(i+base);
107 b(i) = (double)(i*i+base);
112template<
typename F> __global__
113void MyVecLambda(
int size,F func)
115 auto privatizer = thread_privatize(func);
116 auto& body = privatizer.get_priv();
118 int i = blockDim.x * blockIdx.x + threadIdx.x;
127 virtual __device__ __host__
void DoIt2() =0;
135 virtual __device__ __host__
void DoIt2()
override {}
145 auto k = [=](Context1& ctx){ std::cout <<
"A=" << ctx.a <<
"\n"; };
154 constexpr int vsize = 2000;
155 std::vector<double> a(vsize);
156 std::vector<double> b(vsize);
157 std::vector<double> out(vsize);
158 for(
size_t i = 0; i<vsize; ++i ){
159 a[i] = (double)(i+1);
160 b[i] = (double)(i*i+1);
163 size_t mem_size = vsize*
sizeof(double);
164 double* d_a =
nullptr;
165 ARCANE_CHECK_HIP(hipMalloc(&d_a,mem_size));
166 double* d_b =
nullptr;
167 ARCANE_CHECK_HIP(hipMalloc(&d_b,mem_size));
168 double* d_out =
nullptr;
169 ARCANE_CHECK_HIP(hipMalloc(&d_out,mem_size));
171 ARCANE_CHECK_HIP(hipMemcpy(d_a, a.data(), mem_size, hipMemcpyHostToDevice));
172 ARCANE_CHECK_HIP(hipMemcpy(d_b, b.data(), mem_size, hipMemcpyHostToDevice));
173 int threadsPerBlock = 256;
174 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
175 std::cout <<
"CALLING kernel tpb=" << threadsPerBlock <<
" bpg=" << blocksPerGrid <<
"\n";
176 hipLaunchKernelGGL(MyVecAdd, blocksPerGrid, threadsPerBlock , 0, 0, d_a,d_b,d_out);
177 ARCANE_CHECK_HIP(hipDeviceSynchronize());
178 ARCANE_CHECK_HIP(hipMemcpy(out.data(), d_out, mem_size, hipMemcpyDeviceToHost));
179 for(
size_t i=0; i<10; ++i )
180 std::cout <<
"V=" << out[i] <<
"\n";
188 constexpr int vsize = 2000;
189 size_t mem_size = vsize*
sizeof(double);
190 double* d_a =
nullptr;
191 ARCANE_CHECK_HIP(hipMallocManaged(&d_a,mem_size,hipMemAttachGlobal));
192 double* d_b =
nullptr;
193 ARCANE_CHECK_HIP(hipMallocManaged(&d_b,mem_size,hipMemAttachGlobal));
194 double* d_out =
nullptr;
195 ARCANE_CHECK_HIP(hipMallocManaged(&d_out,mem_size,hipMemAttachGlobal));
201 for(
size_t i = 0; i<vsize; ++i ){
202 d_a[i] = (double)(i+1);
203 d_b[i] = (double)(i*i+1);
210 int threadsPerBlock = 256;
211 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
212 std::cout <<
"CALLING kernel2 tpb=" << threadsPerBlock <<
" bpg=" << blocksPerGrid <<
"\n";
213 hipLaunchKernelGGL(MyVecAdd, blocksPerGrid, threadsPerBlock, 0, 0, d_a,d_b,d_out);
214 ARCANE_CHECK_HIP(hipDeviceSynchronize());
215 hipError_t e = hipGetLastError();
216 std::cout <<
"END OF MYVEC1 e=" << e <<
" v=" << hipGetErrorString(e) <<
"\n";
223 for(
size_t i=0; i<10; ++i )
224 std::cout <<
"V=" << d_out[i] <<
"\n";
235 std::cout <<
"TEST_HIP_3\n";
236 constexpr int vsize = 2000;
237 IMemoryAllocator* hip_allocator = Arcane::Accelerator::Hip::getHipMemoryAllocator();
240 ARCANE_FATAL(
"platform::getAcceleratorHostMemoryAllocator() is null");
246 for(
size_t i = 0; i<vsize; ++i ){
247 d_a[i] = (double)(i+1);
248 d_b[i] = (double)(i*i+1);
253 int threadsPerBlock = 256;
254 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
255 std::cout <<
"CALLING kernel2 tpb=" << threadsPerBlock <<
" bpg=" << blocksPerGrid <<
"\n";
256 hipLaunchKernelGGL(MyVecAdd2, blocksPerGrid, threadsPerBlock, 0, 0, d_a,d_b,d_out);
257 ARCANE_CHECK_HIP(hipDeviceSynchronize());
258 hipError_t e = hipGetLastError();
259 std::cout <<
"END OF MYVEC1 e=" << e <<
" v=" << hipGetErrorString(e) <<
"\n";
260 for(
size_t i=0; i<10; ++i )
261 std::cout <<
"V=" << d_out[i] <<
"\n";
265 _initArrays(d_a,d_b,d_out,2);
267 dim3 dimGrid(threadsPerBlock, 1, 1), dimBlock(blocksPerGrid, 1, 1);
273 void *kernelArgs[] = {
280 ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
281 ARCANE_CHECK_HIP(hipLaunchKernel((
void *)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
282 ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
283 for(
size_t i=0; i<10; ++i )
284 std::cout <<
"V2=" << d_out[i] <<
"\n";
289 _initArrays(d_a,d_b,d_out,3);
293 auto func = [=] ARCCORE_HOST_DEVICE (
int i)
295 d_out_span[i] = d_a_span[i] + d_b_span[i];
300 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize,func);
301 ARCANE_CHECK_HIP(hipDeviceSynchronize());
302 for(
size_t i=0; i<10; ++i )
303 std::cout <<
"V3=" << d_out[i] <<
"\n";
305 _initArrays(d_a,d_b,d_out,4);
308 for(
int i=0; i<vsize; ++i )
310 for(
size_t i=0; i<10; ++i )
311 std::cout <<
"V4=" << d_out[i] <<
"\n";
318 for(
Integer i=0; i<vsize; ++i ){
319 Real a = (Real)(i+2);
320 Real b = (Real)(i*i+3);
322 d_a3[i] =
Real3(a,a+1.0,a+2.0);
323 d_b3[i] =
Real3(b,b+2.0,b+3.0);
329 auto func2 = [=] ARCCORE_HOST_DEVICE (
int i) {
330 d_out_span[i] =
math::dot(d_a3_span[i],d_b3_span[i]);
335 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize,func2);
336 ARCANE_CHECK_HIP(hipDeviceSynchronize());
337 std::cout <<
"TEST WITH REAL3\n";
347int arcaneTestHipNumArray()
349 std::cout <<
"TEST_HIP_NUM_ARRAY\n";
350 constexpr int vsize = 2000;
354 ARCANE_FATAL(
"platform::getAcceleratorHostMemoryAllocator() is null");
362 for(
int i = 0; i<vsize; ++i ){
363 d_a(i) = (double)(i+1);
364 d_b(i) = (double)(i*i+1);
369 int threadsPerBlock = 256;
370 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
371 std::cout <<
"CALLING kernel2 tpb=" << threadsPerBlock <<
" bpg=" << blocksPerGrid <<
"\n";
372 hipLaunchKernelGGL(MyVecAdd3, blocksPerGrid, threadsPerBlock, 0, 0, d_a,d_b,d_out);
373 ARCANE_CHECK_HIP(hipDeviceSynchronize());
374 hipError_t e = hipGetLastError();
375 std::cout <<
"END OF MYVEC1 e=" << e <<
" v=" << hipGetErrorString(e) <<
"\n";
376 for(
int i=0; i<10; ++i )
377 std::cout <<
"V=" << d_out(i) <<
"\n";
381 _initArrays(d_a,d_b,d_out,2);
383 dim3 dimGrid(threadsPerBlock, 1, 1), dimBlock(blocksPerGrid, 1, 1);
389 void *kernelArgs[] = {
396 ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
397 ARCANE_CHECK_HIP(hipLaunchKernel((
void *)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
398 ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
399 for(
int i=0; i<10; ++i )
400 std::cout <<
"V2=" << d_out(i) <<
"\n";
405 _initArrays(d_a,d_b,d_out,3);
409 auto func = [=] ARCCORE_HOST_DEVICE (
int i)
411 d_out_span(i) = d_a_span(i) + d_b_span(i);
416 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize,func);
417 ARCANE_CHECK_HIP(hipDeviceSynchronize());
418 for(
int i=0; i<10; ++i )
419 std::cout <<
"V3=" << d_out(i) <<
"\n";
421 _initArrays(d_a,d_b,d_out,4);
424 for(
int i=0; i<vsize; ++i )
426 for(
int i=0; i<10; ++i )
427 std::cout <<
"V4=" << d_out(i) <<
"\n";
434 for(
Integer i=0; i<vsize; ++i ){
435 Real a = (Real)(i+2);
436 Real b = (Real)(i*i+3);
450 auto func2 = [=] ARCCORE_HOST_DEVICE (
int i) {
451 Real3 xa(d_a3_span(i,0),d_a3_span(i,1),d_a3_span(i,2));
452 Real3 xb(d_b3_span(i,0),d_b3_span(i,1),d_b3_span(i,2));
458 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize,func2);
459 ARCANE_CHECK_HIP(hipDeviceSynchronize());
460 std::cout <<
"TEST WITH REAL3\n";
476 std::cout <<
"TestReduction vsize=" << vsize <<
"\n";
481 for(
Integer i=0; i<vsize; ++i ){
482 int a = 5 + ((i+2) % 43);
499 double vxa = (double)(xa[i]);
501 sum_reducer.add(xa[i]);
502 sum_double_reducer.add(vxa);
503 max_int_reducer.max(xa[i]);
504 max_double_reducer.max(vxa);
505 min_int_reducer.min(xa[i]);
506 min_double_reducer.min(vxa);
511 int sum_int_value = sum_reducer.reduce();
512 double sum_double_value = sum_double_reducer.reduce();
513 std::cout <<
"SumReducer name=" << name <<
" v_int=" << sum_int_value
514 <<
" v_double=" << sum_double_value
516 int max_int_value = max_int_reducer.reduce();
517 double max_double_value = max_double_reducer.reduce();
518 std::cout <<
"MaxReducer name=" << name <<
" v_int=" << max_int_value
519 <<
" v_double=" << max_double_value
521 int min_int_value = min_int_reducer.reduce();
522 double min_double_value = min_double_reducer.reduce();
523 std::cout <<
"MinReducer name=" << name <<
" v_int=" << min_int_value
524 <<
" v_double=" << min_double_value
532int arcaneTestHipReduction()
542 int sizes_to_test[] = { 56, 567, 4389, 452182 };
543 for(
int i=0; i<4; ++i ){
544 int vsize = sizes_to_test[i];
545 arcaneTestHipReductionX(vsize,queue1,
"Sequential");
546 arcaneTestHipReductionX(vsize,queue2,
"Thread");
547 arcaneTestHipReductionX(vsize,queue3,
"HIP");
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Types et fonctions pour gérer les synchronisations sur les accélérateurs.
Types et macros pour gérer les boucles sur les accélérateurs.
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
Boucle sur accélérateur avec arguments supplémentaires pour les réductions.
Classe pour effectuer une réduction 'max'.
Classe pour effectuer une réduction 'min'.
Classe pour effectuer une réduction 'somme'.
Gestion d'une commande sur accélérateur.
File d'exécution pour un accélérateur.
Gestionnaire d'exécution pour accélérateur.
Interface d'un allocateur pour la mémoire.
Classe de base des vues multi-dimensionnelles.
Tableaux multi-dimensionnels pour les types numériques accessibles sur accélérateurs.
MDSpanType span()
Vue multi-dimension sur l'instance.
ConstMDSpanType constSpan() const
Vue constante multi-dimension sur l'instance.
void resize(Int32 dim1_size)
Modifie la taille du tableau en ne conservant pas les valeurs actuelles.
Classe gérant un vecteur de réel de dimension 3.
Vue d'un tableau d'éléments de type T.
Chaîne de caractères unicode.
Vecteur 1D de données avec sémantique par valeur (style STL).
__host__ __device__ Real dot(Real2 u, Real2 v)
Produit scalaire de u par v dans ℝ².
Espace de nom pour l'utilisation des accélérateurs.
RunCommand makeCommand(const RunQueue &run_queue)
Créé une commande associée à la file run_queue.
@ Grid
Utilise un noyau de calcul avec une synchronisations entre les blocs.
RunQueue makeQueue(const Runner &runner)
Créé une file associée à runner.
@ HIP
Politique d'exécution utilisant l'environnement HIP.
@ Sequential
Politique d'exécution séquentielle.
@ Thread
Politique d'exécution multi-thread.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
Int32 Integer
Type représentant un entier.
Espace de nom de Arccore.