Arcane  v4.1.0.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
Test.cu.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* Test.cu.cc (C) 2000-2025 */
9/* */
10/* Fichier contenant les tests pour l'implémentation HIP. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include <vector>
15#include <iostream>
16
17#include <hip/hip_runtime.h>
18
19#include "arcane/utils/PlatformUtils.h"
20#include "arcane/utils/NotSupportedException.h"
21#include "arcane/utils/Real3.h"
23
24#include "arcane/core/Item.h"
26
27#include "arcane/accelerator/hip/HipAccelerator.h"
28#include "arcane/accelerator/Runner.h"
29#include "arcane/accelerator/RunQueue.h"
31#include "arcane/accelerator/NumArray.h"
32
33using namespace Arccore;
34using namespace Arcane;
35using namespace Arcane::Accelerator;
36
//! Flat global thread index for a 1D grid of 1D blocks.
__device__ __forceinline__ unsigned int getGlobalIdx_1D_1D()
{
  return blockIdx.x * blockDim.x + threadIdx.x;
}
43
/*!
 * \brief Holds a per-thread private copy of an object (typically a functor),
 * so that each device thread mutates its own instance.
 */
template <typename T>
struct Privatizer
{
  using value_type = T; //std::decay<T>;
  using reference_type = value_type&;

  //! The thread-private copy.
  value_type priv;

  ARCCORE_HOST_DEVICE Privatizer(const T& o)
  : priv(o)
  {}

  //! Access the thread-private copy.
  ARCCORE_HOST_DEVICE reference_type get_priv()
  {
    return priv;
  }
};
54
/*!
 * \brief Wraps \a item in a Privatizer, yielding a per-thread private copy.
 */
template <typename T>
ARCCORE_HOST_DEVICE auto thread_privatize(const T& item) -> Privatizer<T>
{
  Privatizer<T> result{item};
  return result;
}
60
//! Element-wise vector addition: out[i] = a[i] + b[i].
// NOTE(review): there is no bounds check on 'i'. The callers round the grid
// up to a multiple of the block size (e.g. 2048 threads for 2000 elements in
// arcaneTestHip1/arcaneTestHip2), so the trailing threads read and write past
// the end of the arrays. Prefer passing the size and guarding like MyVecAdd2
// does; the signature is kept here because the visible launch sites pass no
// size argument.
__global__ void MyVecAdd(double* a,double* b,double* out)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  out[i] = a[i] + b[i];
  if (i<10){
    // printf("A=%d %lf %lf %lf %d\n",i,a[i],b[i],out[i],3);
  }
}
69
//! Bounds-checked element-wise addition over Arcane spans: out[i] = a[i] + b[i].
__global__ void MyVecAdd2(Span<const double> a,Span<const double>b,Span<double> out)
{
  Int64 i = blockDim.x * blockIdx.x + threadIdx.x;
  // Guard the rounded-up grid tail.
  if (i < a.size())
    out[i] = a[i] + b[i];
}
81
83{
84 Int32 size = static_cast<Int32>(a.extent0());
85 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
86 if (i>=size)
87 return;
88 out(i) = a(i) + b(i);
89 if (i<10){
90 //printf("A=%d %lf %lf %lf %d\n",i,a(i),b(i),out(i),i);
91 }
92}
93
//! Fills \a a and \a b with deterministic values derived from \a base and zeroes \a c.
void _initArrays(Span<double> a,Span<double> b,Span<double> c,int base)
{
  const Int64 n = a.size();
  for( Int64 idx = 0; idx < n; ++idx ){
    a[idx] = static_cast<double>(idx + base);
    b[idx] = static_cast<double>(idx * idx + base);
    c[idx] = 0.0;
  }
}
103
105{
106 Int32 vsize = static_cast<Int32>(a.extent0());
107 for( Int32 i = 0; i<vsize; ++i ){
108 a(i) = (double)(i+base);
109 b(i) = (double)(i*i+base);
110 c(i) = 0.0;
111 }
112}
113
/*!
 * \brief Generic 1D kernel: invokes \a func(i) for every i in [0, size).
 *
 * The functor is copied per thread via thread_privatize() before invocation.
 */
template<typename F> __global__
void MyVecLambda(int size,F func)
{
  auto privatizer = thread_privatize(func);
  auto& body = privatizer.get_priv();

  const int i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < size)
    body(i);
}
124
namespace TestCuda
{
// Abstract interface with a pure virtual method marked callable on both host
// and device.
// NOTE(review): calling a virtual method on the device is only valid if the
// object was constructed in device code (the vtable pointer is set at
// construction time) — the createFooDerived() pattern later in this file
// illustrates this; confirm intended usage of these test classes.
class IA
{
 virtual __device__ __host__ void DoIt2() =0;
};

// Concrete implementation of IA with an empty body.
class A
: public IA
{
 public:
 //__global__ void DoIt(){}
 virtual __device__ __host__ void DoIt2() override {}
};
}
140
//! Host-only sanity check: a by-value-capturing lambda taking a local struct
//! by reference; prints "A=3".
void MyTestFunc1()
{
  struct Context1
  {
    int a;
  };
  Context1 my_ctx;
  my_ctx.a = 3;
  auto k = [=](Context1& ctx){ std::cout << "A=" << ctx.a << "\n"; };
  k(my_ctx);
}
152
/*!
 * \brief Basic HIP test: explicit device allocations, host→device copies and
 * a raw kernel launch of MyVecAdd.
 * \return 0 on success (HIP API errors abort via ARCANE_CHECK_HIP).
 */
extern "C"
int arcaneTestHip1()
{
  constexpr int vsize = 2000;
  std::vector<double> a(vsize);
  std::vector<double> b(vsize);
  std::vector<double> out(vsize);
  for( size_t i = 0; i<vsize; ++i ){
    a[i] = (double)(i+1);
    b[i] = (double)(i*i+1);
    out[i] = 0.0;
  }
  size_t mem_size = vsize*sizeof(double);
  double* d_a = nullptr;
  ARCANE_CHECK_HIP(hipMalloc(&d_a,mem_size));
  double* d_b = nullptr;
  ARCANE_CHECK_HIP(hipMalloc(&d_b,mem_size));
  double* d_out = nullptr;
  ARCANE_CHECK_HIP(hipMalloc(&d_out,mem_size));

  ARCANE_CHECK_HIP(hipMemcpy(d_a, a.data(), mem_size, hipMemcpyHostToDevice));
  ARCANE_CHECK_HIP(hipMemcpy(d_b, b.data(), mem_size, hipMemcpyHostToDevice));
  int threadsPerBlock = 256;
  // Ceil-division so the grid covers all elements.
  int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
  std::cout << "CALLING kernel tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
  hipLaunchKernelGGL(MyVecAdd, blocksPerGrid, threadsPerBlock , 0, 0, d_a,d_b,d_out);
  // Catch launch-configuration errors before synchronizing (the launch itself
  // does not return a status).
  ARCANE_CHECK_HIP(hipGetLastError());
  ARCANE_CHECK_HIP(hipDeviceSynchronize());
  ARCANE_CHECK_HIP(hipMemcpy(out.data(), d_out, mem_size, hipMemcpyDeviceToHost));
  for( size_t i=0; i<10; ++i )
    std::cout << "V=" << out[i] << "\n";
  // Release the device buffers (previously leaked).
  ARCANE_CHECK_HIP(hipFree(d_a));
  ARCANE_CHECK_HIP(hipFree(d_b));
  ARCANE_CHECK_HIP(hipFree(d_out));
  return 0;
}
185
/*!
 * \brief HIP test using unified (managed) memory: the host initializes the
 * buffers directly and the kernel reads/writes them in place.
 * \return 0 on success (HIP API errors abort via ARCANE_CHECK_HIP).
 */
extern "C"
int arcaneTestHip2()
{
  MyTestFunc1();
  constexpr int vsize = 2000;
  size_t mem_size = vsize*sizeof(double);
  double* d_a = nullptr;
  ARCANE_CHECK_HIP(hipMallocManaged(&d_a,mem_size,hipMemAttachGlobal));
  double* d_b = nullptr;
  ARCANE_CHECK_HIP(hipMallocManaged(&d_b,mem_size,hipMemAttachGlobal));
  double* d_out = nullptr;
  ARCANE_CHECK_HIP(hipMallocManaged(&d_out,mem_size,hipMemAttachGlobal));

  // Managed memory is directly addressable from the host: no hipMemcpy needed.
  for( size_t i = 0; i<vsize; ++i ){
    d_a[i] = (double)(i+1);
    d_b[i] = (double)(i*i+1);
    d_out[i] = 0.0;
  }

  int threadsPerBlock = 256;
  int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
  std::cout << "CALLING kernel2 tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
  hipLaunchKernelGGL(MyVecAdd, blocksPerGrid, threadsPerBlock, 0, 0, d_a,d_b,d_out);
  ARCANE_CHECK_HIP(hipDeviceSynchronize());
  hipError_t e = hipGetLastError();
  std::cout << "END OF MYVEC1 e=" << e << " v=" << hipGetErrorString(e) << "\n";
  for( size_t i=0; i<10; ++i )
    std::cout << "V=" << d_out[i] << "\n";

  // Release the managed allocations (previously leaked).
  ARCANE_CHECK_HIP(hipFree(d_a));
  ARCANE_CHECK_HIP(hipFree(d_b));
  ARCANE_CHECK_HIP(hipFree(d_out));
  return 0;
}
230
231/*---------------------------------------------------------------------------*/
232/*---------------------------------------------------------------------------*/
233
/*!
 * \brief HIP test using Arcane UniqueArray + Span with three launch styles:
 * hipLaunchKernelGGL, dynamic hipLaunchKernel with an argument array, and a
 * templated lambda kernel (MyVecLambda).
 */
extern "C" int arcaneTestHip3()
{
  std::cout << "TEST_HIP_3\n";
  constexpr int vsize = 2000;
  IMemoryAllocator* hip_allocator = Arcane::Accelerator::Hip::getHipMemoryAllocator();
  // NOTE(review): 'hip_allocator2' is declared on a line not visible in this
  // excerpt — presumably platform::getAcceleratorHostMemoryAllocator() given
  // the fatal message below; confirm.
  if (!hip_allocator2)
    ARCANE_FATAL("platform::getAcceleratorHostMemoryAllocator() is null");
  UniqueArray<double> d_a(hip_allocator, vsize);
  MyTestFunc1();
  UniqueArray<double> d_b(hip_allocator, vsize);
  UniqueArray<double> d_out(hip_allocator, vsize);

  for (size_t i = 0; i < vsize; ++i) {
    d_a[i] = (double)(i + 1);
    d_b[i] = (double)(i * i + 1);
    d_out[i] = 0.0; //a[i] + b[i];
  }

  int threadsPerBlock = 256;
  int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
  std::cout << "CALLING kernel2 tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
  // UniqueArray arguments convert to the Span parameters of MyVecAdd2.
  hipLaunchKernelGGL(MyVecAdd2, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
  ARCANE_CHECK_HIP(hipDeviceSynchronize());
  hipError_t e = hipGetLastError();
  std::cout << "END OF MYVEC1 e=" << e << " v=" << hipGetErrorString(e) << "\n";
  for (size_t i = 0; i < 10; ++i)
    std::cout << "V=" << d_out[i] << "\n";

  // Launch a kernel dynamically (hipLaunchKernel with an argument array).
  {
    _initArrays(d_a, d_b, d_out, 2);

    // NOTE(review): the names look swapped — 'dimGrid' receives
    // threadsPerBlock (256) and 'dimBlock' receives blocksPerGrid (8). The
    // launch still covers all 2000 elements (256*8 = 2048 threads, and
    // MyVecAdd2 bounds-checks), but the grid/block roles are the opposite of
    // what the names suggest.
    dim3 dimGrid(threadsPerBlock, 1, 1), dimBlock(blocksPerGrid, 1, 1);

    Span<const double> d_a_span = d_a.span();
    Span<const double> d_b_span = d_b.span();
    Span<double> d_out_view = d_out.span();

    // hipLaunchKernel takes the kernel arguments as an array of pointers to
    // each argument value.
    void* kernelArgs[] = {
      (void*)&d_a_span,
      (void*)&d_b_span,
      (void*)&d_out_view
    };
    size_t smemSize = 0;
    hipStream_t stream;
    ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
    ARCANE_CHECK_HIP(hipLaunchKernel((void*)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
    ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
    // NOTE(review): 'stream' is never destroyed (hipStreamDestroy) — leaked.
    for (size_t i = 0; i < 10; ++i)
      std::cout << "V2=" << d_out[i] << "\n";
  }

  // Launch a lambda through the templated MyVecLambda kernel.
  {
    _initArrays(d_a, d_b, d_out, 3);
    Span<const double> d_a_span = d_a.span();
    Span<const double> d_b_span = d_b.span();
    Span<double> d_out_span = d_out.span();
    auto func = [=] ARCCORE_HOST_DEVICE(int i) {
      d_out_span[i] = d_a_span[i] + d_b_span[i];
      if (i<10){
        //printf("A=%d %lf %lf %lf\n",i,d_a_span[i],d_b_span[i],d_out_span[i]);
      } };

    hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func);
    ARCANE_CHECK_HIP(hipDeviceSynchronize());
    for (size_t i = 0; i < 10; ++i)
      std::cout << "V3=" << d_out[i] << "\n";

    _initArrays(d_a, d_b, d_out, 4);

    // Call the 'host' version of the same lambda.
    for (int i = 0; i < vsize; ++i)
      func(i);
    for (size_t i = 0; i < 10; ++i)
      std::cout << "V4=" << d_out[i] << "\n";
  }

  // Use Real3 values.
  {
    UniqueArray<Real3> d_a3(hip_allocator, vsize);
    UniqueArray<Real3> d_b3(hip_allocator, vsize);
    for (Integer i = 0; i < vsize; ++i) {
      Real a = (Real)(i + 2);
      Real b = (Real)(i * i + 3);

      d_a3[i] = Real3(a, a + 1.0, a + 2.0);
      d_b3[i] = Real3(b, b + 2.0, b + 3.0);
    }

    Span<const Real3> d_a3_span = d_a3.span();
    Span<const Real3> d_b3_span = d_b3.span();
    Span<double> d_out_span = d_out.span();
    auto func2 = [=] ARCCORE_HOST_DEVICE(int i) {
      d_out_span[i] = math::dot(d_a3_span[i], d_b3_span[i]);
      if (i < 10) {
        //printf("DOT=%d %lf\n",i,d_out_span[i]);
      }
    };
    hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func2);
    ARCANE_CHECK_HIP(hipDeviceSynchronize());
    std::cout << "TEST WITH REAL3\n";
  }

  return 0;
}
341
342/*---------------------------------------------------------------------------*/
343/*---------------------------------------------------------------------------*/
344
/*!
 * \brief Same scenarios as arcaneTestHip3 but using NumArray/MDSpan
 * containers instead of UniqueArray/Span.
 */
extern "C" int arcaneTestHipNumArray()
{
  std::cout << "TEST_HIP_NUM_ARRAY\n";
  constexpr int vsize = 2000;
  //IMemoryAllocator* hip_allocator = Arcane::Accelerator::Hip::getHipMemoryAllocator();
  // NOTE(review): 'hip_allocator2' and the declarations of d_a/d_b/d_out
  // (presumably NumArray<double, MDDim1>) come from lines not visible in this
  // excerpt — TODO confirm against the full source.
  if (!hip_allocator2)
    ARCANE_FATAL("platform::getAcceleratorHostMemoryAllocator() is null");
  MyTestFunc1();
  d_a.resize(vsize);
  d_b.resize(vsize);
  d_out.resize(vsize);
  for (int i = 0; i < vsize; ++i) {
    d_a(i) = (double)(i + 1);
    d_b(i) = (double)(i * i + 1);
    d_out(i) = 0.0; //a[i] + b[i];
  }

  int threadsPerBlock = 256;
  int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
  std::cout << "CALLING kernel2 tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
  hipLaunchKernelGGL(MyVecAdd3, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
  ARCANE_CHECK_HIP(hipDeviceSynchronize());
  hipError_t e = hipGetLastError();
  std::cout << "END OF MYVEC1 e=" << e << " v=" << hipGetErrorString(e) << "\n";
  for (int i = 0; i < 10; ++i)
    std::cout << "V=" << d_out(i) << "\n";

  // Launch a kernel dynamically (hipLaunchKernel with an argument array).
  {
    _initArrays(d_a, d_b, d_out, 2);

    // NOTE(review): as in arcaneTestHip3, 'dimGrid'/'dimBlock' receive the
    // swapped values (grid of 256, blocks of 8); coverage is still complete.
    dim3 dimGrid(threadsPerBlock, 1, 1), dimBlock(blocksPerGrid, 1, 1);

    // NOTE(review): the declarations of 'd_a_span'/'d_b_span' are on lines
    // not visible in this excerpt (presumably constMDSpan() views).
    MDSpan<double, MDDim1> d_out_view = d_out.mdspan();

    void* kernelArgs[] = {
      (void*)&d_a_span,
      (void*)&d_b_span,
      (void*)&d_out_view
    };
    size_t smemSize = 0;
    hipStream_t stream;
    ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
    // NOTE(review): this launches MyVecAdd2, whose parameters are Span<...>,
    // while the argument array holds MDSpan values — MyVecAdd3 was probably
    // intended here; verify. Also 'stream' is never destroyed.
    ARCANE_CHECK_HIP(hipLaunchKernel((void*)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
    ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
    for (int i = 0; i < 10; ++i)
      std::cout << "V2=" << d_out(i) << "\n";
  }

  // Launch a lambda through the templated MyVecLambda kernel.
  {
    _initArrays(d_a, d_b, d_out, 3);
    // NOTE(review): 'd_a_span'/'d_b_span' used below are declared on lines
    // not visible in this excerpt.
    MDSpan<double, MDDim1> d_out_span = d_out.mdspan();
    auto func = [=] ARCCORE_HOST_DEVICE(int i) {
      d_out_span(i) = d_a_span(i) + d_b_span(i);
      if (i<10){
        //printf("A=%d %lf %lf %lf\n",i,d_a_span(i),d_b_span(i),d_out_span(i));
      } };

    hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func);
    ARCANE_CHECK_HIP(hipDeviceSynchronize());
    for (int i = 0; i < 10; ++i)
      std::cout << "V3=" << d_out(i) << "\n";

    _initArrays(d_a, d_b, d_out, 4);

    // Call the 'host' version of the same lambda.
    for (int i = 0; i < vsize; ++i)
      func(i);
    for (int i = 0; i < 10; ++i)
      std::cout << "V4=" << d_out(i) << "\n";
  }

  // Use Real3-like values stored in a multi-dimensional (vsize x 3) array.
  {
    NumArray<Real, MDDim2> d_a3(vsize, 3);
    NumArray<Real, MDDim2> d_b3(vsize, 3);
    for (Integer i = 0; i < vsize; ++i) {
      Real a = (Real)(i + 2);
      Real b = (Real)(i * i + 3);

      d_a3(i, 0) = a;
      d_a3(i, 1) = a + 1.0;
      d_a3(i, 2) = a + 2.0;

      // NOTE(review): component offsets here are (b, b+1, b+2) while the
      // UniqueArray<Real3> variant in arcaneTestHip3 uses (b, b+2, b+3) —
      // possibly intentional, but the asymmetry is worth confirming.
      d_b3(i, 0) = b;
      d_b3(i, 1) = b + 1.0;
      d_b3(i, 2) = b + 2.0;
    }

    MDSpan<const Real, MDDim2> d_a3_span = d_a3.constMDSpan();
    MDSpan<const Real, MDDim2> d_b3_span = d_b3.constMDSpan();
    MDSpan<double, MDDim1> d_out_span = d_out.mdspan();
    auto func2 = [=] ARCCORE_HOST_DEVICE(int i) {
      Real3 xa(d_a3_span(i, 0), d_a3_span(i, 1), d_a3_span(i, 2));
      Real3 xb(d_b3_span(i, 0), d_b3_span(i, 1), d_b3_span(i, 2));
      d_out_span(i) = math::dot(xa, xb);
      if (i < 10) {
        //printf("DOT NUMARRAY=%d %lf\n",i,d_out_span(i));
      }
    };
    hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func2);
    ARCANE_CHECK_HIP(hipDeviceSynchronize());
    std::cout << "TEST WITH REAL3\n";
  }

  return 0;
}
461
462/*---------------------------------------------------------------------------*/
463/*---------------------------------------------------------------------------*/
464
466
467namespace ax = Arcane::Accelerator;
468
/*!
 * \brief Runs sum/max/min reductions (in int and double flavours) over a
 * deterministic array of \a vsize values on \a queue, printing each result
 * tagged with \a name.
 */
void arcaneTestHipReductionX(int vsize, ax::RunQueue& queue, const String& name)
{
  using namespace Arcane::Accelerator;
  std::cout << "TestReduction vsize=" << vsize << "\n";
  // NOTE(review): 'hip_allocator2' is declared on a line not visible in this
  // excerpt — presumably a host-accessible accelerator allocator; confirm.
  UniqueArray<int> d_a(hip_allocator2, vsize);
  UniqueArray<int> d_out(hip_allocator2, vsize);

  // Deterministic values: 5 + ((i+2) % 43), i.e. in [5, 47].
  for (Integer i = 0; i < vsize; ++i) {
    int a = 5 + ((i + 2) % 43);
    d_a[i] = a;
    d_out[i] = 0;
    //std::cout << "I=" << i << " a=" << a << "\n";
  }
  RunCommand command = makeCommand(queue);
  // One reducer per (operation, type) pair, all fed by the same loop below.
  ReducerSum<int> sum_reducer(command);
  ReducerSum<double> sum_double_reducer(command);
  ReducerMax<int> max_int_reducer(command);
  ReducerMax<double> max_double_reducer(command);
  ReducerMin<int> min_int_reducer(command);
  ReducerMin<double> min_double_reducer(command);
  Span<const int> xa = d_a.span();
  Span<int> xout = d_out.span();
  command << RUNCOMMAND_LOOP1(idx, vsize)
  {
    auto [i] = idx();
    double vxa = (double)(xa[i]);
    xout[i] = xa[i];
    sum_reducer.add(xa[i]);
    sum_double_reducer.add(vxa);
    max_int_reducer.max(xa[i]);
    max_double_reducer.max(vxa);
    min_int_reducer.min(xa[i]);
    min_double_reducer.min(vxa);
    //if (i<10)
    //printf("Do Reduce i=%d v=%d %lf\n",i,xa[i],vxa);
  };

  // reduce() retrieves each final reduced value after the command ran.
  int sum_int_value = sum_reducer.reduce();
  double sum_double_value = sum_double_reducer.reduce();
  std::cout << "SumReducer name=" << name << " v_int=" << sum_int_value
            << " v_double=" << sum_double_value
            << "\n";
  int max_int_value = max_int_reducer.reduce();
  double max_double_value = max_double_reducer.reduce();
  std::cout << "MaxReducer name=" << name << " v_int=" << max_int_value
            << " v_double=" << max_double_value
            << "\n";
  int min_int_value = min_int_reducer.reduce();
  double min_double_value = min_double_reducer.reduce();
  std::cout << "MinReducer name=" << name << " v_int=" << min_int_value
            << " v_double=" << min_double_value
            << "\n";
}
523
524#include <stdio.h>
525
//! Device helper: integer addition (used as a device function-pointer target).
__device__ int my_add(int a, int b) {
  int sum = a;
  sum += b;
  return sum;
}
529
//! Device helper: integer multiplication (alternate function-pointer target).
__device__ int mul(int a, int b) {
  int product = a;
  product *= b;
  return product;
}
533
534// Pointeur de fonction sur le device
535//__device__ int (*op)(int, int) = &add;
536
//! Stores op_func(idx, idx) into d_result[idx] for every idx < N.
//! 'op_func' must be a device-side function pointer (a host function address
//! is not callable from device code).
__global__ void compute(int *d_result, int N, int (*op_func)(int, int)) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= N)
    return;
  d_result[idx] = op_func(idx, idx);
}
546using BinaryFuncType = int (*)(int a, int b);
547
548__device__ int (*my_func_ptr)(int a, int b) = my_add;
549
//! Publishes the device address of my_add through 'func_ptr'.
//! The address of a __device__ function can only be taken in device code,
//! hence this single-purpose kernel.
__global__ void kernelSetFunction(BinaryFuncType* func_ptr)
{
  func_ptr[0] = my_add;
}
555
557{
558 static __device__ int doFunc(int a, int b)
559 {
560 return a+b;
561 }
562};
563
565{
566 public:
567
568 //virtual ARCCORE_HOST_DEVICE ~FooBase() {}
569 virtual ARCCORE_HOST_DEVICE int apply(int a,int b) =0;
570};
571
573: public FooBase
574{
575 public:
576 ARCCORE_HOST_DEVICE int apply(int a,int b) override { return a+b;}
577};
578
//! Calls the virtual apply() on each thread: d_result[idx] = ptr->apply(idx, idx).
//! 'ptr' must point to an object constructed in device code so that its
//! vtable pointer is valid on the device (see createFooDerived()).
__global__ void compute_virtual(int* d_result, int N, FooBase* ptr)
{
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= N)
    return;
  d_result[idx] = ptr->apply(idx, idx);
}
592
//! Placement-constructs a FooDerived in pre-allocated device memory so that
//! its vtable pointer is a device address. Only thread 0 constructs.
__global__ void createFooDerived(FooDerived* ptr)
{
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx != 0)
    return;
  new (ptr) FooDerived();
}
600
/*!
 * \brief Tests calling a virtual method from a kernel.
 *
 * A FooDerived instance is placement-constructed on the device
 * (createFooDerived) so its vtable pointer is a device address; the
 * compute_virtual kernel then calls the virtual apply() from every thread.
 * \return 0 on success.
 */
extern "C"
int arcaneTestVirtualFunction()
{
  std::cout << "Test function pointer\n";
  //std::cout << "FuncPtr direct=" << my_func_ptr << "\n";
  std::cout.flush();

  const int N = 10;
  int h_result[N];
  int* d_result = nullptr;
  FooDerived* foo_derived = nullptr;
  ARCANE_CHECK_HIP(hipMalloc(&foo_derived, sizeof(FooDerived)));
  // Construct the object on the device so its vtable is a device address.
  createFooDerived<<<1, 1>>>(foo_derived);
  // Check for launch-configuration errors, then execution errors.
  ARCANE_CHECK_HIP(hipGetLastError());
  ARCANE_CHECK_HIP(hipDeviceSynchronize());

  ARCANE_CHECK_HIP(hipMalloc(&d_result, N * sizeof(int)));

  // Kept only for the commented-out function-pointer experiments below.
  // Fix: the previous code hipMalloc'ed a device buffer directly into this
  // function pointer, which both leaked and abused the type; removed.
  int (*host_func)(int, int) = nullptr;
  (void)host_func;

  //my_func_ptr = my_add;
  //cudaMemcpyFromSymbol(&host_func, my_func_ptr, sizeof(void*));
  std::cout << "Set function pointer\n";
  //kernelSetFunction<<<1, 1>>>(&host_func);

  std::cout << "Wait end\n";
  std::cout.flush();
  ARCANE_CHECK_HIP(hipDeviceSynchronize());

  std::cout << "Calling compute\n";
  std::cout.flush();

  // Kernel call through the virtual interface.
  //compute<<<1, N>>>(d_result, N, host_func);
  //compute<<<1, N>>>(d_result, N, my_func_ptr);
  compute_virtual<<<1, N>>>(d_result, N, foo_derived);
  ARCANE_CHECK_HIP(hipGetLastError());
  ARCANE_CHECK_HIP(hipDeviceSynchronize());

  ARCANE_CHECK_HIP(hipMemcpy(h_result, d_result, N * sizeof(int), hipMemcpyDeviceToHost));

  for (int i = 0; i < N; ++i) {
    printf("%d ", h_result[i]);
  }
  printf("\n");

  // Fix: also free foo_derived (previously leaked). FooDerived has a trivial
  // destructor, so no device-side destruction pass is required.
  ARCANE_CHECK_HIP(hipFree(foo_derived));
  ARCANE_CHECK_HIP(hipFree(d_result));
  return 0;
}
650
651/*---------------------------------------------------------------------------*/
652/*---------------------------------------------------------------------------*/
653
/*!
 * \brief Runs arcaneTestHipReductionX over sequential, thread and HIP queues
 * for several array sizes.
 * \return 0 on success.
 */
extern "C" int arcaneTestHipReduction()
{
  // TODO: also test ranges that do not start at 0.
  std::cout << "Test Reductions\n";
  // NOTE(review): 'runner_seq', 'runner_thread' and 'runner_hip' are declared
  // on lines not visible in this excerpt — presumably ax::Runner instances,
  // one per execution policy; confirm.
  ax::RunQueue queue1{ makeQueue(runner_seq) };
  ax::RunQueue queue2{ makeQueue(runner_thread) };
  ax::RunQueue queue3{ makeQueue(runner_hip) };
  // Sizes chosen to exercise partial blocks and large grids.
  int sizes_to_test[] = { 56, 567, 4389, 452182 };
  for (int i = 0; i < 4; ++i) {
    int vsize = sizes_to_test[i];
    arcaneTestHipReductionX(vsize, queue1, "Sequential");
    arcaneTestHipReductionX(vsize, queue2, "Thread");
    arcaneTestHipReductionX(vsize, queue3, "HIP");
  }
  return 0;
}
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Fonctions mathématiques diverses.
Types et fonctions pour gérer les synchronisations sur les accélérateurs.
Types et macros pour gérer les boucles sur les accélérateurs.
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
Boucle 1D sur accélérateur avec arguments supplémentaires.
Fonctions de gestion mémoire et des allocateurs.
Classe pour effectuer une réduction 'max'.
Definition Reduce.h:683
Classe pour effectuer une réduction 'min'.
Definition Reduce.h:718
Classe pour effectuer une réduction 'somme'.
Definition Reduce.h:651
Gestion d'une commande sur accélérateur.
File d'exécution pour un accélérateur.
Gestionnaire d'exécution pour accélérateur.
Definition core/Runner.h:67
Interface d'un allocateur pour la mémoire.
Classe de base des vues multi-dimensionnelles.
Tableaux multi-dimensionnels pour les types numériques accessibles sur accélérateurs.
MDSpanType span()
Vue multi-dimension sur l'instance.
MDSpanType mdspan()
Vue multi-dimension sur l'instance.
ConstMDSpanType constMDSpan() const
Vue constante multi-dimension sur l'instance.
void resize(Int32 dim1_size)
Modifie la taille du tableau sans conserver les valeurs actuelles.
Classe gérant un vecteur de réel de dimension 3.
Definition Real3.h:132
Vue d'un tableau d'éléments de type T.
Definition Span.h:612
Chaîne de caractères unicode.
Vecteur 1D de données avec sémantique par valeur (style STL).
__host__ __device__ Real dot(Real2 u, Real2 v)
Produit scalaire de u par v dans \(\mathbb{R}^2\).
Definition MathUtils.h:96
Espace de nom pour l'utilisation des accélérateurs.
RunCommand makeCommand(const RunQueue &run_queue)
Créé une commande associée à la file run_queue.
RunQueue makeQueue(const Runner &runner)
Créé une file associée à runner.
@ HIP
Politique d'exécution utilisant l'environnement HIP.
@ Sequential
Politique d'exécution séquentielle.
@ Thread
Politique d'exécution multi-thread.
ARCCORE_COMMON_EXPORT IMemoryAllocator * getDefaultDataAllocator()
Allocateur par défaut pour les données.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
Int32 Integer
Type représentant un entier.
Espace de nom de Arccore.