Arcane  v3.15.0.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
Test.cu.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2023 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* Test.cu.cc (C) 2000-2023 */
9/* */
10/* Fichier contenant les tests pour l'implémentation HIP. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include <vector>
15#include <iostream>
16
17#include <hip/hip_runtime.h>
18
19#include "arcane/utils/PlatformUtils.h"
20#include "arcane/utils/NotSupportedException.h"
21#include "arcane/utils/Real3.h"
22#include "arcane/Item.h"
23#include "arcane/MathUtils.h"
24
25#include "arcane/accelerator/hip/HipAccelerator.h"
26#include "arcane/accelerator/Runner.h"
27#include "arcane/accelerator/RunQueue.h"
29#include "arcane/accelerator/NumArray.h"
30
31using namespace Arccore;
32using namespace Arcane;
33using namespace Arcane::Accelerator;
34
35__device__ __forceinline__ unsigned int getGlobalIdx_1D_1D()
36{
37 unsigned int blockId = blockIdx.x;
38 unsigned int threadId = blockId * blockDim.x + threadIdx.x;
39 return threadId;
40}
41
42template <typename T>
44{
45 using value_type = T; //std::decay<T>;
46 using reference_type = value_type&;
47 value_type priv;
48
49 ARCCORE_HOST_DEVICE Privatizer(const T& o) : priv{o} {}
50 ARCCORE_HOST_DEVICE reference_type get_priv() { return priv; }
51};
52
53template <typename T>
54ARCCORE_HOST_DEVICE auto thread_privatize(const T& item) -> Privatizer<T>
55{
56 return Privatizer<T>{item};
57}
58
59__global__ void MyVecAdd(double* a,double* b,double* out)
60{
61 int i = blockDim.x * blockIdx.x + threadIdx.x;
62 out[i] = a[i] + b[i];
63 if (i<10){
64 // printf("A=%d %lf %lf %lf %d\n",i,a[i],b[i],out[i],3);
65 }
66}
67
69{
70 Int64 size = a.size();
71 Int64 i = blockDim.x * blockIdx.x + threadIdx.x;
72 if (i>=size)
73 return;
74 out[i] = a[i] + b[i];
75 if (i<10){
76 //printf("A=%d %lf %lf %lf %d\n",i,a[i],b[i],out[i],i);
77 }
78}
79
81{
82 Int32 size = static_cast<Int32>(a.extent0());
83 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
84 if (i>=size)
85 return;
86 out(i) = a(i) + b(i);
87 if (i<10){
88 //printf("A=%d %lf %lf %lf %d\n",i,a(i),b(i),out(i),i);
89 }
90}
91
92void _initArrays(Span<double> a,Span<double> b,Span<double> c,int base)
93{
94 Int64 vsize = a.size();
95 for( Int64 i = 0; i<vsize; ++i ){
96 a[i] = (double)(i+base);
97 b[i] = (double)(i*i+base);
98 c[i] = 0.0;
99 }
100}
101
103{
104 Int32 vsize = static_cast<Int32>(a.extent0());
105 for( Int32 i = 0; i<vsize; ++i ){
106 a(i) = (double)(i+base);
107 b(i) = (double)(i*i+base);
108 c(i) = 0.0;
109 }
110}
111
112template<typename F> __global__
113void MyVecLambda(int size,F func)
114{
115 auto privatizer = thread_privatize(func);
116 auto& body = privatizer.get_priv();
117
118 int i = blockDim.x * blockIdx.x + threadIdx.x;
119 if (i<size)
120 body(i);
121}
122
123namespace TestCuda
124{
125class IA
126{
127 virtual __device__ __host__ void DoIt2() =0;
128};
129
130class A
131: public IA
132{
133 public:
134 //__global__ void DoIt(){}
135 virtual __device__ __host__ void DoIt2() override {}
136};
137}
138
139void MyTestFunc1()
140{
141 struct Context1
142 {
143 int a;
144 };
145 auto k = [=](Context1& ctx){ std::cout << "A=" << ctx.a << "\n"; };
147 my_ctx.a = 3;
148 k(my_ctx);
149}
150
151extern "C"
152int arcaneTestHip1()
153{
154 constexpr int vsize = 2000;
155 std::vector<double> a(vsize);
156 std::vector<double> b(vsize);
157 std::vector<double> out(vsize);
158 for( size_t i = 0; i<vsize; ++i ){
159 a[i] = (double)(i+1);
160 b[i] = (double)(i*i+1);
161 out[i] = 0.0; //a[i] + b[i];
162 }
163 size_t mem_size = vsize*sizeof(double);
164 double* d_a = nullptr;
165 ARCANE_CHECK_HIP(hipMalloc(&d_a,mem_size));
166 double* d_b = nullptr;
167 ARCANE_CHECK_HIP(hipMalloc(&d_b,mem_size));
168 double* d_out = nullptr;
169 ARCANE_CHECK_HIP(hipMalloc(&d_out,mem_size));
170
171 ARCANE_CHECK_HIP(hipMemcpy(d_a, a.data(), mem_size, hipMemcpyHostToDevice));
172 ARCANE_CHECK_HIP(hipMemcpy(d_b, b.data(), mem_size, hipMemcpyHostToDevice));
173 int threadsPerBlock = 256;
174 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
175 std::cout << "CALLING kernel tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
177 ARCANE_CHECK_HIP(hipDeviceSynchronize());
178 ARCANE_CHECK_HIP(hipMemcpy(out.data(), d_out, mem_size, hipMemcpyDeviceToHost));
179 for( size_t i=0; i<10; ++i )
180 std::cout << "V=" << out[i] << "\n";
181 return 0;
182}
183
184extern "C"
185int arcaneTestHip2()
186{
187 MyTestFunc1();
188 constexpr int vsize = 2000;
189 size_t mem_size = vsize*sizeof(double);
190 double* d_a = nullptr;
192 double* d_b = nullptr;
194 double* d_out = nullptr;
196
197 //d_a = new double[vsize];
198 //d_b = new double[vsize];
199 //d_out = new double[vsize];
200
201 for( size_t i = 0; i<vsize; ++i ){
202 d_a[i] = (double)(i+1);
203 d_b[i] = (double)(i*i+1);
204 d_out[i] = 0.0; //a[i] + b[i];
205 }
206
207
208 //hipMemcpy(d_a, a.data(), mem_size, hipMemcpyHostToDevice);
209 //hipMemcpy(d_b, b.data(), mem_size, hipMemcpyHostToDevice);
210 int threadsPerBlock = 256;
211 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
212 std::cout << "CALLING kernel2 tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
214 ARCANE_CHECK_HIP(hipDeviceSynchronize());
216 std::cout << "END OF MYVEC1 e=" << e << " v=" << hipGetErrorString(e) << "\n";
217 //hipDeviceSynchronize();
218 //e = hipGetLastError();
219 //std::cout << "END OF MYVEC2 e=" << e << " v=" << hipGetErrorString(e) << "\n";
220 //hipMemcpy(out.data(), d_out, mem_size, hipMemcpyDeviceToHost);
221 //e = hipGetLastError();
222 //std::cout << "END OF MYVEC3 e=" << e << " v=" << hipGetErrorString(e) << "\n";
223 for( size_t i=0; i<10; ++i )
224 std::cout << "V=" << d_out[i] << "\n";
225
226 return 0;
227}
228
229/*---------------------------------------------------------------------------*/
230/*---------------------------------------------------------------------------*/
231
232extern "C"
233int arcaneTestHip3()
234{
235 std::cout << "TEST_HIP_3\n";
236 constexpr int vsize = 2000;
237 IMemoryAllocator* hip_allocator = Arcane::Accelerator::Hip::getHipMemoryAllocator();
238 IMemoryAllocator* hip_allocator2 = Arcane::platform::getAcceleratorHostMemoryAllocator();
239 if (!hip_allocator2)
240 ARCANE_FATAL("platform::getAcceleratorHostMemoryAllocator() is null");
242 MyTestFunc1();
245
246 for( size_t i = 0; i<vsize; ++i ){
247 d_a[i] = (double)(i+1);
248 d_b[i] = (double)(i*i+1);
249 d_out[i] = 0.0; //a[i] + b[i];
250 }
251
252
253 int threadsPerBlock = 256;
254 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
255 std::cout << "CALLING kernel2 tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
257 ARCANE_CHECK_HIP(hipDeviceSynchronize());
259 std::cout << "END OF MYVEC1 e=" << e << " v=" << hipGetErrorString(e) << "\n";
260 for( size_t i=0; i<10; ++i )
261 std::cout << "V=" << d_out[i] << "\n";
262
263 // Lance un noyau dynamiquement
264 {
265 _initArrays(d_a,d_b,d_out,2);
266
268
272
273 void *kernelArgs[] = {
274 (void*)&d_a_span,
275 (void*)&d_b_span,
276 (void*)&d_out_view
277 };
278 size_t smemSize = 0;
279 hipStream_t stream;
280 ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
281 ARCANE_CHECK_HIP(hipLaunchKernel((void *)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
282 ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
283 for( size_t i=0; i<10; ++i )
284 std::cout << "V2=" << d_out[i] << "\n";
285 }
286
287 // Lance une lambda
288 {
289 _initArrays(d_a,d_b,d_out,3);
293 auto func = [=] ARCCORE_HOST_DEVICE (int i)
294 {
295 d_out_span[i] = d_a_span[i] + d_b_span[i];
296 if (i<10){
297 //printf("A=%d %lf %lf %lf\n",i,d_a_span[i],d_b_span[i],d_out_span[i]);
298 }};
299
300 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize,func);
301 ARCANE_CHECK_HIP(hipDeviceSynchronize());
302 for( size_t i=0; i<10; ++i )
303 std::cout << "V3=" << d_out[i] << "\n";
304
305 _initArrays(d_a,d_b,d_out,4);
306
307 // Appelle la version 'hote' de la lambda
308 for( int i=0; i<vsize; ++i )
309 func(i);
310 for( size_t i=0; i<10; ++i )
311 std::cout << "V4=" << d_out[i] << "\n";
312 }
313
314 // Utilise les Real3
315 {
318 for( Integer i=0; i<vsize; ++i ){
319 Real a = (Real)(i+2);
320 Real b = (Real)(i*i+3);
321
322 d_a3[i] = Real3(a,a+1.0,a+2.0);
323 d_b3[i] = Real3(b,b+2.0,b+3.0);
324 }
325
329 auto func2 = [=] ARCCORE_HOST_DEVICE (int i) {
330 d_out_span[i] = math::dot(d_a3_span[i],d_b3_span[i]);
331 if (i<10){
332 //printf("DOT=%d %lf\n",i,d_out_span[i]);
333 }
334 };
335 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize,func2);
336 ARCANE_CHECK_HIP(hipDeviceSynchronize());
337 std::cout << "TEST WITH REAL3\n";
338 }
339
340 return 0;
341}
342
343/*---------------------------------------------------------------------------*/
344/*---------------------------------------------------------------------------*/
345
346extern "C"
347int arcaneTestHipNumArray()
348{
349 std::cout << "TEST_HIP_NUM_ARRAY\n";
350 constexpr int vsize = 2000;
351 //IMemoryAllocator* hip_allocator = Arcane::Accelerator::Hip::getHipMemoryAllocator();
352 IMemoryAllocator* hip_allocator2 = Arcane::platform::getAcceleratorHostMemoryAllocator();
353 if (!hip_allocator2)
354 ARCANE_FATAL("platform::getAcceleratorHostMemoryAllocator() is null");
356 MyTestFunc1();
359 d_a.resize(vsize);
360 d_b.resize(vsize);
361 d_out.resize(vsize);
362 for( int i = 0; i<vsize; ++i ){
363 d_a(i) = (double)(i+1);
364 d_b(i) = (double)(i*i+1);
365 d_out(i) = 0.0; //a[i] + b[i];
366 }
367
368
369 int threadsPerBlock = 256;
370 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
371 std::cout << "CALLING kernel2 tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
373 ARCANE_CHECK_HIP(hipDeviceSynchronize());
375 std::cout << "END OF MYVEC1 e=" << e << " v=" << hipGetErrorString(e) << "\n";
376 for( int i=0; i<10; ++i )
377 std::cout << "V=" << d_out(i) << "\n";
378
379 // Lance un noyau dynamiquement
380 {
381 _initArrays(d_a,d_b,d_out,2);
382
384
388
389 void *kernelArgs[] = {
390 (void*)&d_a_span,
391 (void*)&d_b_span,
392 (void*)&d_out_view
393 };
394 size_t smemSize = 0;
395 hipStream_t stream;
396 ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
397 ARCANE_CHECK_HIP(hipLaunchKernel((void *)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
398 ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
399 for( int i=0; i<10; ++i )
400 std::cout << "V2=" << d_out(i) << "\n";
401 }
402
403 // Lance une lambda
404 {
405 _initArrays(d_a,d_b,d_out,3);
409 auto func = [=] ARCCORE_HOST_DEVICE (int i)
410 {
411 d_out_span(i) = d_a_span(i) + d_b_span(i);
412 if (i<10){
413 //printf("A=%d %lf %lf %lf\n",i,d_a_span(i),d_b_span(i),d_out_span(i));
414 }};
415
416 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize,func);
417 ARCANE_CHECK_HIP(hipDeviceSynchronize());
418 for( int i=0; i<10; ++i )
419 std::cout << "V3=" << d_out(i) << "\n";
420
421 _initArrays(d_a,d_b,d_out,4);
422
423 // Appelle la version 'hote' de la lambda
424 for( int i=0; i<vsize; ++i )
425 func(i);
426 for( int i=0; i<10; ++i )
427 std::cout << "V4=" << d_out(i) << "\n";
428 }
429
430 // Utilise les Real3 avec un tableau multi-dimensionel
431 {
434 for( Integer i=0; i<vsize; ++i ){
435 Real a = (Real)(i+2);
436 Real b = (Real)(i*i+3);
437
438 d_a3(i,0) = a;
439 d_a3(i,1) = a + 1.0;
440 d_a3(i,2) = a + 2.0;
441
442 d_b3(i,0) = b;
443 d_b3(i,1) = b + 1.0;
444 d_b3(i,2) = b + 2.0;
445 }
446
450 auto func2 = [=] ARCCORE_HOST_DEVICE (int i) {
451 Real3 xa(d_a3_span(i,0),d_a3_span(i,1),d_a3_span(i,2));
452 Real3 xb(d_b3_span(i,0),d_b3_span(i,1),d_b3_span(i,2));
453 d_out_span(i) = math::dot(xa,xb);
454 if (i<10){
455 //printf("DOT NUMARRAY=%d %lf\n",i,d_out_span(i));
456 }
457 };
458 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize,func2);
459 ARCANE_CHECK_HIP(hipDeviceSynchronize());
460 std::cout << "TEST WITH REAL3\n";
461 }
462
463 return 0;
464}
465
466/*---------------------------------------------------------------------------*/
467/*---------------------------------------------------------------------------*/
468
470
471namespace ax = Arcane::Accelerator;
472
473void arcaneTestHipReductionX(int vsize,ax::RunQueue& queue,const String& name)
474{
475 using namespace Arcane::Accelerator;
476 std::cout << "TestReduction vsize=" << vsize << "\n";
477 IMemoryAllocator* hip_allocator2 = Arcane::platform::getAcceleratorHostMemoryAllocator();
480
481 for( Integer i=0; i<vsize; ++i ){
482 int a = 5 + ((i+2) % 43);
483 d_a[i] = a;
484 d_out[i] = 0;
485 //std::cout << "I=" << i << " a=" << a << "\n";
486 }
487 RunCommand command = makeCommand(queue);
494 Span<const int> xa = d_a.span();
495 Span<int> xout = d_out.span();
496 command << RUNCOMMAND_LOOP1(idx, vsize)
497 {
498 auto [i] = idx();
499 double vxa = (double)(xa[i]);
500 xout[i] = xa[i];
501 sum_reducer.add(xa[i]);
503 max_int_reducer.max(xa[i]);
505 min_int_reducer.min(xa[i]);
507 //if (i<10)
508 //printf("Do Reduce i=%d v=%d %lf\n",i,xa[i],vxa);
509 };
510
511 int sum_int_value = sum_reducer.reduce();
512 double sum_double_value = sum_double_reducer.reduce();
513 std::cout << "SumReducer name=" << name << " v_int=" << sum_int_value
514 << " v_double=" << sum_double_value
515 << "\n";
516 int max_int_value = max_int_reducer.reduce();
517 double max_double_value = max_double_reducer.reduce();
518 std::cout << "MaxReducer name=" << name << " v_int=" << max_int_value
519 << " v_double=" << max_double_value
520 << "\n";
521 int min_int_value = min_int_reducer.reduce();
522 double min_double_value = min_double_reducer.reduce();
523 std::cout << "MinReducer name=" << name << " v_int=" << min_int_value
524 << " v_double=" << min_double_value
525 << "\n";
526}
527
528/*---------------------------------------------------------------------------*/
529/*---------------------------------------------------------------------------*/
530
531extern "C"
532int arcaneTestHipReduction()
533{
534 // TODO: tester en ne commancant pas par 0.
535 ax::Runner runner_seq(ax::eExecutionPolicy::Sequential);
536 ax::Runner runner_thread(ax::eExecutionPolicy::Thread);
537 ax::Runner runner_hip(ax::eExecutionPolicy::HIP);
538 runner_hip.setDeviceReducePolicy(ax::eDeviceReducePolicy::Grid);
542 int sizes_to_test[] = { 56, 567, 4389, 452182 };
543 for( int i=0; i<4; ++i ){
544 int vsize = sizes_to_test[i];
545 arcaneTestHipReductionX(vsize,queue1,"Sequential");
546 arcaneTestHipReductionX(vsize,queue2,"Thread");
547 arcaneTestHipReductionX(vsize,queue3,"HIP");
548 }
549 return 0;
550}
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Types et fonctions pour gérer les synchronisations sur les accélérateurs.
Types et macros pour gérer les boucles sur les accélérateurs.
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
Boucle sur accélérateur avec arguments supplémentaires pour les réductions.
Gestion d'une commande sur accélérateur.
File d'exécution pour un accélérateur.
Gestionnaire d'exécution pour accélérateur.
Definition core/Runner.h:68
Lecteur des fichiers de maillage via la bibliothèque LIMA.
Definition Lima.cc:149
Classe gérant un vecteur de réel de dimension 3.
Definition Real3.h:132
Chaîne de caractères unicode.
Espace de nom pour l'utilisation des accélérateurs.
RunCommand makeCommand(const RunQueue &run_queue)
Créé une commande associée à la file run_queue.
RunQueue makeQueue(const Runner &runner)
Créé une file associée à runner.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
Espace de nom de Arccore.
Definition ArcaneTypes.h:24
double Real
Type représentant un réel.
Int32 Integer
Type représentant un entier.