Arcane  v4.1.0.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
Test.cu.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* Test.cu.cc (C) 2000-2025 */
9/* */
10/* Fichier contenant les tests pour l'implémentation HIP. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include <vector>
15#include <iostream>
16
17#include <hip/hip_runtime.h>
18
19#include "arcane/utils/PlatformUtils.h"
20#include "arcane/utils/NotSupportedException.h"
21#include "arcane/utils/Real3.h"
23
24#include "arcane/core/Item.h"
26
27#include "arcane/accelerator/hip/HipAccelerator.h"
28#include "arcane/accelerator/Runner.h"
29#include "arcane/accelerator/RunQueue.h"
31#include "arcane/accelerator/NumArray.h"
32
33using namespace Arccore;
34using namespace Arcane;
35using namespace Arcane::Accelerator;
36
37__device__ __forceinline__ unsigned int getGlobalIdx_1D_1D()
38{
39 unsigned int blockId = blockIdx.x;
40 unsigned int threadId = blockId * blockDim.x + threadIdx.x;
41 return threadId;
42}
43
44template <typename T>
45struct Privatizer
46{
47 using value_type = T; //std::decay<T>;
48 using reference_type = value_type&;
49 value_type priv;
50
51 ARCCORE_HOST_DEVICE Privatizer(const T& o) : priv{o} {}
52 ARCCORE_HOST_DEVICE reference_type get_priv() { return priv; }
53};
54
55template <typename T>
56ARCCORE_HOST_DEVICE auto thread_privatize(const T& item) -> Privatizer<T>
57{
58 return Privatizer<T>{item};
59}
60
/*!
 * \brief Element-wise vector addition: out[i] = a[i] + b[i].
 *
 * NOTE(review): there is no bounds check on \a i and no size parameter, so
 * every launched thread writes one element. With the launch configurations
 * used in this file (ceil(2000/256)*256 = 2048 threads over 2000-element
 * buffers) the last threads access past the end of the arrays unless the
 * buffers are padded — confirm/fix at the call sites.
 */
__global__ void MyVecAdd(double* a,double* b,double* out)
{
  int i = blockDim.x * blockIdx.x + threadIdx.x;
  out[i] = a[i] + b[i];
  if (i<10){
    // printf("A=%d %lf %lf %lf %d\n",i,a[i],b[i],out[i],3);
  }
}
69
70__global__ void MyVecAdd2(Span<const double> a,Span<const double>b,Span<double> out)
71{
72 Int64 size = a.size();
73 Int64 i = blockDim.x * blockIdx.x + threadIdx.x;
74 if (i>=size)
75 return;
76 out[i] = a[i] + b[i];
77 if (i<10){
78 //printf("A=%d %lf %lf %lf %d\n",i,a[i],b[i],out[i],i);
79 }
80}
81
83{
84 Int32 size = static_cast<Int32>(a.extent0());
85 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
86 if (i>=size)
87 return;
88 out(i) = a(i) + b(i);
89 if (i<10){
90 //printf("A=%d %lf %lf %lf %d\n",i,a(i),b(i),out(i),i);
91 }
92}
93
94void _initArrays(Span<double> a,Span<double> b,Span<double> c,int base)
95{
96 Int64 vsize = a.size();
97 for( Int64 i = 0; i<vsize; ++i ){
98 a[i] = (double)(i+base);
99 b[i] = (double)(i*i+base);
100 c[i] = 0.0;
101 }
102}
103
105{
106 Int32 vsize = static_cast<Int32>(a.extent0());
107 for( Int32 i = 0; i<vsize; ++i ){
108 a(i) = (double)(i+base);
109 b(i) = (double)(i*i+base);
110 c(i) = 0.0;
111 }
112}
113
114template<typename F> __global__
115void MyVecLambda(int size,F func)
116{
117 auto privatizer = thread_privatize(func);
118 auto& body = privatizer.get_priv();
119
120 int i = blockDim.x * blockIdx.x + threadIdx.x;
121 if (i<size)
122 body(i);
123}
124
namespace TestCuda
{
//! Abstract interface with a pure virtual method callable on host and device.
// NOTE(review): virtual dispatch on the device only works if the object was
// constructed on the device (a host-constructed object carries a host vtable
// pointer) — confirm how instances of these classes are created.
class IA
{
 virtual __device__ __host__ void DoIt2() =0;
};

//! Concrete implementation of IA with an empty DoIt2().
class A
: public IA
{
 public:
 //__global__ void DoIt(){}
 virtual __device__ __host__ void DoIt2() override {}
};
}
140
//! Small host-only lambda test: builds a context object and prints its field.
void MyTestFunc1()
{
  struct Context1
  {
    int a;
  };
  Context1 my_ctx;
  my_ctx.a = 3;
  auto print_ctx = [=](Context1& ctx){ std::cout << "A=" << ctx.a << "\n"; };
  print_ctx(my_ctx);
}
152
153extern "C"
154int arcaneTestHip1()
155{
156 constexpr int vsize = 2000;
157 std::vector<double> a(vsize);
158 std::vector<double> b(vsize);
159 std::vector<double> out(vsize);
160 for( size_t i = 0; i<vsize; ++i ){
161 a[i] = (double)(i+1);
162 b[i] = (double)(i*i+1);
163 out[i] = 0.0; //a[i] + b[i];
164 }
165 size_t mem_size = vsize*sizeof(double);
166 double* d_a = nullptr;
167 ARCANE_CHECK_HIP(hipMalloc(&d_a,mem_size));
168 double* d_b = nullptr;
169 ARCANE_CHECK_HIP(hipMalloc(&d_b,mem_size));
170 double* d_out = nullptr;
171 ARCANE_CHECK_HIP(hipMalloc(&d_out,mem_size));
172
173 ARCANE_CHECK_HIP(hipMemcpy(d_a, a.data(), mem_size, hipMemcpyHostToDevice));
174 ARCANE_CHECK_HIP(hipMemcpy(d_b, b.data(), mem_size, hipMemcpyHostToDevice));
175 int threadsPerBlock = 256;
176 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
177 std::cout << "CALLING kernel tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
178 hipLaunchKernelGGL(MyVecAdd, blocksPerGrid, threadsPerBlock , 0, 0, d_a,d_b,d_out);
179 ARCANE_CHECK_HIP(hipDeviceSynchronize());
180 ARCANE_CHECK_HIP(hipMemcpy(out.data(), d_out, mem_size, hipMemcpyDeviceToHost));
181 for( size_t i=0; i<10; ++i )
182 std::cout << "V=" << out[i] << "\n";
183 return 0;
184}
185
186extern "C"
187int arcaneTestHip2()
188{
189 MyTestFunc1();
190 constexpr int vsize = 2000;
191 size_t mem_size = vsize*sizeof(double);
192 double* d_a = nullptr;
193 ARCANE_CHECK_HIP(hipMallocManaged(&d_a,mem_size,hipMemAttachGlobal));
194 double* d_b = nullptr;
195 ARCANE_CHECK_HIP(hipMallocManaged(&d_b,mem_size,hipMemAttachGlobal));
196 double* d_out = nullptr;
197 ARCANE_CHECK_HIP(hipMallocManaged(&d_out,mem_size,hipMemAttachGlobal));
198
199 //d_a = new double[vsize];
200 //d_b = new double[vsize];
201 //d_out = new double[vsize];
202
203 for( size_t i = 0; i<vsize; ++i ){
204 d_a[i] = (double)(i+1);
205 d_b[i] = (double)(i*i+1);
206 d_out[i] = 0.0; //a[i] + b[i];
207 }
208
209
210 //hipMemcpy(d_a, a.data(), mem_size, hipMemcpyHostToDevice);
211 //hipMemcpy(d_b, b.data(), mem_size, hipMemcpyHostToDevice);
212 int threadsPerBlock = 256;
213 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
214 std::cout << "CALLING kernel2 tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
215 hipLaunchKernelGGL(MyVecAdd, blocksPerGrid, threadsPerBlock, 0, 0, d_a,d_b,d_out);
216 ARCANE_CHECK_HIP(hipDeviceSynchronize());
217 hipError_t e = hipGetLastError();
218 std::cout << "END OF MYVEC1 e=" << e << " v=" << hipGetErrorString(e) << "\n";
219 //hipDeviceSynchronize();
220 //e = hipGetLastError();
221 //std::cout << "END OF MYVEC2 e=" << e << " v=" << hipGetErrorString(e) << "\n";
222 //hipMemcpy(out.data(), d_out, mem_size, hipMemcpyDeviceToHost);
223 //e = hipGetLastError();
224 //std::cout << "END OF MYVEC3 e=" << e << " v=" << hipGetErrorString(e) << "\n";
225 for( size_t i=0; i<10; ++i )
226 std::cout << "V=" << d_out[i] << "\n";
227
228 return 0;
229}
230
231/*---------------------------------------------------------------------------*/
232/*---------------------------------------------------------------------------*/
233
/*!
 * \brief Test using Arcane UniqueArray with a HIP memory allocator.
 *
 * Exercises four paths: a static kernel launch (MyVecAdd2), a dynamic launch
 * via hipLaunchKernel with an argument array, a host/device lambda run through
 * MyVecLambda (on device then on host), and Real3 dot products on device.
 */
extern "C" int arcaneTestHip3()
{
  std::cout << "TEST_HIP_3\n";
  constexpr int vsize = 2000;
  IMemoryAllocator* hip_allocator = Arcane::Accelerator::Hip::getHipMemoryAllocator();
  // NOTE(review): 'hip_allocator2' is checked below but its declaration is
  // not visible in this view — confirm it is defined (the fatal message
  // suggests platform::getAcceleratorHostMemoryAllocator()).
  if (!hip_allocator2)
    ARCANE_FATAL("platform::getAcceleratorHostMemoryAllocator() is null");
  UniqueArray<double> d_a(hip_allocator, vsize);
  MyTestFunc1();
  UniqueArray<double> d_b(hip_allocator, vsize);
  UniqueArray<double> d_out(hip_allocator, vsize);

  for (size_t i = 0; i < vsize; ++i) {
    d_a[i] = (double)(i + 1);
    d_b[i] = (double)(i * i + 1);
    d_out[i] = 0.0; //a[i] + b[i];
  }

  int threadsPerBlock = 256;
  int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
  std::cout << "CALLING kernel2 tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
  hipLaunchKernelGGL(MyVecAdd2, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
  ARCANE_CHECK_HIP(hipDeviceSynchronize());
  hipError_t e = hipGetLastError();
  std::cout << "END OF MYVEC1 e=" << e << " v=" << hipGetErrorString(e) << "\n";
  for (size_t i = 0; i < 10; ++i)
    std::cout << "V=" << d_out[i] << "\n";

  // Launch a kernel dynamically (hipLaunchKernel with an argument array).
  {
    _initArrays(d_a, d_b, d_out, 2);

    // NOTE(review): the names look swapped — 'dimGrid' receives
    // threadsPerBlock and 'dimBlock' receives blocksPerGrid. The launch
    // below still covers vsize threads (and MyVecAdd2 bounds-checks), but
    // the naming is misleading — confirm intent.
    dim3 dimGrid(threadsPerBlock, 1, 1), dimBlock(blocksPerGrid, 1, 1);

    Span<const double> d_a_span = d_a.span();
    Span<const double> d_b_span = d_b.span();
    Span<double> d_out_view = d_out.span();

    // hipLaunchKernel takes an array of pointers to each kernel argument.
    void* kernelArgs[] = {
      (void*)&d_a_span,
      (void*)&d_b_span,
      (void*)&d_out_view
    };
    size_t smemSize = 0;
    hipStream_t stream;
    ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
    ARCANE_CHECK_HIP(hipLaunchKernel((void*)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
    ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
    for (size_t i = 0; i < 10; ++i)
      std::cout << "V2=" << d_out[i] << "\n";
  }

  // Launch a host/device lambda through the generic MyVecLambda kernel.
  {
    _initArrays(d_a, d_b, d_out, 3);
    Span<const double> d_a_span = d_a.span();
    Span<const double> d_b_span = d_b.span();
    Span<double> d_out_span = d_out.span();
    auto func = [=] ARCCORE_HOST_DEVICE(int i) {
      d_out_span[i] = d_a_span[i] + d_b_span[i];
      if (i<10){
        //printf("A=%d %lf %lf %lf\n",i,d_a_span[i],d_b_span[i],d_out_span[i]);
      } };

    hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func);
    ARCANE_CHECK_HIP(hipDeviceSynchronize());
    for (size_t i = 0; i < 10; ++i)
      std::cout << "V3=" << d_out[i] << "\n";

    _initArrays(d_a, d_b, d_out, 4);

    // Call the 'host' version of the same lambda.
    for (int i = 0; i < vsize; ++i)
      func(i);
    for (size_t i = 0; i < 10; ++i)
      std::cout << "V4=" << d_out[i] << "\n";
  }

  // Use Real3 values (dot product computed on device).
  {
    UniqueArray<Real3> d_a3(hip_allocator, vsize);
    UniqueArray<Real3> d_b3(hip_allocator, vsize);
    for (Integer i = 0; i < vsize; ++i) {
      Real a = (Real)(i + 2);
      Real b = (Real)(i * i + 3);

      d_a3[i] = Real3(a, a + 1.0, a + 2.0);
      d_b3[i] = Real3(b, b + 2.0, b + 3.0);
    }

    Span<const Real3> d_a3_span = d_a3.span();
    Span<const Real3> d_b3_span = d_b3.span();
    Span<double> d_out_span = d_out.span();
    auto func2 = [=] ARCCORE_HOST_DEVICE(int i) {
      d_out_span[i] = math::dot(d_a3_span[i], d_b3_span[i]);
      if (i < 10) {
        //printf("DOT=%d %lf\n",i,d_out_span[i]);
      }
    };
    hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func2);
    ARCANE_CHECK_HIP(hipDeviceSynchronize());
    std::cout << "TEST WITH REAL3\n";
  }

  return 0;
}
341
342/*---------------------------------------------------------------------------*/
343/*---------------------------------------------------------------------------*/
344
/*!
 * \brief Same tests as arcaneTestHip3 but using NumArray / MDSpan containers.
 *
 * NOTE(review): several declarations were lost from this view (the
 * 'hip_allocator2' check target, the NumArray declarations of d_a/d_b/d_out,
 * and the d_a_span/d_b_span locals in two scopes below) — the comments here
 * describe only what is visible.
 */
extern "C" int arcaneTestHipNumArray()
{
  std::cout << "TEST_HIP_NUM_ARRAY\n";
  constexpr int vsize = 2000;
  //IMemoryAllocator* hip_allocator = Arcane::Accelerator::Hip::getHipMemoryAllocator();
  if (!hip_allocator2)
    ARCANE_FATAL("platform::getAcceleratorHostMemoryAllocator() is null");
  MyTestFunc1();
  d_a.resize(vsize);
  d_b.resize(vsize);
  d_out.resize(vsize);
  for (int i = 0; i < vsize; ++i) {
    d_a(i) = (double)(i + 1);
    d_b(i) = (double)(i * i + 1);
    d_out(i) = 0.0; //a[i] + b[i];
  }

  int threadsPerBlock = 256;
  int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
  std::cout << "CALLING kernel2 tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
  hipLaunchKernelGGL(MyVecAdd3, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
  ARCANE_CHECK_HIP(hipDeviceSynchronize());
  hipError_t e = hipGetLastError();
  std::cout << "END OF MYVEC1 e=" << e << " v=" << hipGetErrorString(e) << "\n";
  for (int i = 0; i < 10; ++i)
    std::cout << "V=" << d_out(i) << "\n";

  // Launch a kernel dynamically (hipLaunchKernel with an argument array).
  {
    _initArrays(d_a, d_b, d_out, 2);

    // NOTE(review): as in arcaneTestHip3, the 'dimGrid'/'dimBlock' names look
    // swapped (dimGrid receives threadsPerBlock) — the launch still covers
    // vsize threads, but confirm intent.
    dim3 dimGrid(threadsPerBlock, 1, 1), dimBlock(blocksPerGrid, 1, 1);

    MDSpan<double, MDDim1> d_out_view = d_out.mdspan();

    // hipLaunchKernel takes an array of pointers to each kernel argument.
    void* kernelArgs[] = {
      (void*)&d_a_span,
      (void*)&d_b_span,
      (void*)&d_out_view
    };
    size_t smemSize = 0;
    hipStream_t stream;
    ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
    ARCANE_CHECK_HIP(hipLaunchKernel((void*)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
    ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
    for (int i = 0; i < 10; ++i)
      std::cout << "V2=" << d_out(i) << "\n";
  }

  // Launch a host/device lambda through the generic MyVecLambda kernel.
  {
    _initArrays(d_a, d_b, d_out, 3);
    MDSpan<double, MDDim1> d_out_span = d_out.mdspan();
    auto func = [=] ARCCORE_HOST_DEVICE(int i) {
      d_out_span(i) = d_a_span(i) + d_b_span(i);
      if (i<10){
        //printf("A=%d %lf %lf %lf\n",i,d_a_span(i),d_b_span(i),d_out_span(i));
      } };

    hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func);
    ARCANE_CHECK_HIP(hipDeviceSynchronize());
    for (int i = 0; i < 10; ++i)
      std::cout << "V3=" << d_out(i) << "\n";

    _initArrays(d_a, d_b, d_out, 4);

    // Call the 'host' version of the same lambda.
    for (int i = 0; i < vsize; ++i)
      func(i);
    for (int i = 0; i < 10; ++i)
      std::cout << "V4=" << d_out(i) << "\n";
  }

  // Use Real3 values stored in a multi-dimensional (vsize x 3) array.
  {
    NumArray<Real, MDDim2> d_a3(vsize, 3);
    NumArray<Real, MDDim2> d_b3(vsize, 3);
    for (Integer i = 0; i < vsize; ++i) {
      Real a = (Real)(i + 2);
      Real b = (Real)(i * i + 3);

      d_a3(i, 0) = a;
      d_a3(i, 1) = a + 1.0;
      d_a3(i, 2) = a + 2.0;

      d_b3(i, 0) = b;
      d_b3(i, 1) = b + 1.0;
      d_b3(i, 2) = b + 2.0;
    }

    MDSpan<const Real, MDDim2> d_a3_span = d_a3.constMDSpan();
    MDSpan<const Real, MDDim2> d_b3_span = d_b3.constMDSpan();
    MDSpan<double, MDDim1> d_out_span = d_out.mdspan();
    // Dot product of each (row of d_a3, row of d_b3) pair.
    auto func2 = [=] ARCCORE_HOST_DEVICE(int i) {
      Real3 xa(d_a3_span(i, 0), d_a3_span(i, 1), d_a3_span(i, 2));
      Real3 xb(d_b3_span(i, 0), d_b3_span(i, 1), d_b3_span(i, 2));
      d_out_span(i) = math::dot(xa, xb);
      if (i < 10) {
        //printf("DOT NUMARRAY=%d %lf\n",i,d_out_span(i));
      }
    };
    hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func2);
    ARCANE_CHECK_HIP(hipDeviceSynchronize());
    std::cout << "TEST WITH REAL3\n";
  }

  return 0;
}
461
462/*---------------------------------------------------------------------------*/
463/*---------------------------------------------------------------------------*/
464
466
467namespace ax = Arcane::Accelerator;
468
/*!
 * \brief Runs sum/max/min reductions (int and double) over \a vsize values
 * on \a queue and prints the results tagged with \a name.
 *
 * NOTE(review): 'hip_allocator2' is used below but its declaration is not
 * visible in this view — confirm it is defined.
 */
void arcaneTestHipReductionX(int vsize, ax::RunQueue& queue, const String& name)
{
  using namespace Arcane::Accelerator;
  std::cout << "TestReduction vsize=" << vsize << "\n";
  UniqueArray<int> d_a(hip_allocator2, vsize);
  UniqueArray<int> d_out(hip_allocator2, vsize);

  // Fill the input with values in [5,47] so min/max are non-trivial.
  for (Integer i = 0; i < vsize; ++i) {
    int a = 5 + ((i + 2) % 43);
    d_a[i] = a;
    d_out[i] = 0;
    //std::cout << "I=" << i << " a=" << a << "\n";
  }
  // One command carries all six reducers; the loop body feeds each of them.
  RunCommand command = makeCommand(queue);
  ReducerSum<int> sum_reducer(command);
  ReducerSum<double> sum_double_reducer(command);
  ReducerMax<int> max_int_reducer(command);
  ReducerMax<double> max_double_reducer(command);
  ReducerMin<int> min_int_reducer(command);
  ReducerMin<double> min_double_reducer(command);
  Span<const int> xa = d_a.span();
  Span<int> xout = d_out.span();
  command << RUNCOMMAND_LOOP1(idx, vsize)
  {
    auto [i] = idx();
    double vxa = (double)(xa[i]);
    xout[i] = xa[i];
    sum_reducer.add(xa[i]);
    sum_double_reducer.add(vxa);
    max_int_reducer.max(xa[i]);
    max_double_reducer.max(vxa);
    min_int_reducer.min(xa[i]);
    min_double_reducer.min(vxa);
    //if (i<10)
    //printf("Do Reduce i=%d v=%d %lf\n",i,xa[i],vxa);
  };

  // NOTE(review): presumably reduce() synchronizes with the command before
  // returning the final value — confirm against the Reducer documentation.
  int sum_int_value = sum_reducer.reduce();
  double sum_double_value = sum_double_reducer.reduce();
  std::cout << "SumReducer name=" << name << " v_int=" << sum_int_value
            << " v_double=" << sum_double_value
            << "\n";
  int max_int_value = max_int_reducer.reduce();
  double max_double_value = max_double_reducer.reduce();
  std::cout << "MaxReducer name=" << name << " v_int=" << max_int_value
            << " v_double=" << max_double_value
            << "\n";
  int min_int_value = min_int_reducer.reduce();
  double min_double_value = min_double_reducer.reduce();
  std::cout << "MinReducer name=" << name << " v_int=" << min_int_value
            << " v_double=" << min_double_value
            << "\n";
}
523
524#include <stdio.h>
525
526__device__ int my_add(int a, int b) {
527 return a + b;
528}
529
530__device__ int mul(int a, int b) {
531 return a * b;
532}
533
534// Pointeur de fonction sur le device
535//__device__ int (*op)(int, int) = &add;
536
537__global__ void compute(int *d_result, int N, int (*op_func)(int, int)) {
538 int idx = threadIdx.x + blockIdx.x * blockDim.x;
539 //if (idx == 0)
540 //printf("MyFuncDevice=%p\n",op_func);
541
542 if (idx < N) {
543 d_result[idx] = op_func(idx, idx);
544 }
545}
546using BinaryFuncType = int (*)(int a, int b);
547
548__device__ int (*my_func_ptr)(int a, int b) = my_add;
549
//! Stores the address of the device function my_add into *func_ptr.
// A device function's address is only meaningful when taken in device code,
// hence this kernel rather than a host-side assignment.
__global__ void kernelSetFunction(BinaryFuncType* func_ptr)
{
  *func_ptr = my_add;
  //printf("MyAddDevice=%p\n",my_add);
}
555
557{
558 static __device__ int doFunc(int a, int b)
559 {
560 return a+b;
561 }
562};
563
565{
566 public:
567
568 //virtual ARCCORE_HOST_DEVICE ~FooBase() {}
569 virtual ARCCORE_HOST_DEVICE int apply(int a,int b) =0;
570};
571
573: public FooBase
574{
575 public:
576 ARCCORE_HOST_DEVICE int apply(int a,int b) override { return a+b;}
577};
578
579__global__ void compute_virtual(int* d_result, int N, FooBase* ptr)
580{
581 //FooBase* ptr = nullptr;
582 //FooDerived my_foo;
583 //ptr = &my_foo;
584 int idx = threadIdx.x + blockIdx.x * blockDim.x;
585 //if (idx == 0)
586 //printf("MyFuncDevice=%p\n",op_func);
587
588 if (idx < N) {
589 d_result[idx] = ptr->apply(idx, idx);
590 }
591}
592
//! Constructs a FooDerived in pre-allocated device memory via placement new.
// Constructing on the device makes the object's vtable pointer refer to
// device code, which is required for virtual calls from kernels.
// Only thread 0 performs the construction.
__global__ void createFooDerived(FooDerived* ptr)
{
  int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx==0) {
    new (ptr) FooDerived();
  }
}
600
601extern "C"
602int arcaneTestVirtualFunction()
603{
604 std::cout << "Test function pointer\n";
605 //std::cout << "FuncPtr direct=" << my_func_ptr << "\n";
606 std::cout.flush();
607
608 const int N = 10;
609 int h_result[N];
610 int* d_result;
611 FooDerived* foo_derived = nullptr;
612 ARCANE_CHECK_HIP(hipMalloc(&foo_derived, sizeof(FooDerived)));
613 createFooDerived<<<1, 1>>>(foo_derived);
614 ARCANE_CHECK_HIP(hipDeviceSynchronize());
615
616 ARCANE_CHECK_HIP(hipMalloc(&d_result, N * sizeof(int)));
617
618 int (*host_func)(int, int) = nullptr;
619 ARCANE_CHECK_HIP(hipMalloc(&host_func, sizeof(void*) * 8));
620
621 //my_func_ptr = my_add;
622 //cudaMemcpyFromSymbol(&host_func, my_func_ptr, sizeof(void*));
623 std::cout << "Set function pointer\n";
624 //kernelSetFunction<<<1, 1>>>(&host_func);
625
626 std::cout << "Wait end\n";
627 std::cout.flush();
628 ARCANE_CHECK_HIP(hipDeviceSynchronize());
629
630 std::cout << "Calling compute\n";
631 std::cout.flush();
632
633 // Appel du kernel
634 //compute<<<1, N>>>(d_result, N, host_func);
635 //compute<<<1, N>>>(d_result, N, my_func_ptr);
636 compute_virtual<<<1, N>>>(d_result, N, foo_derived);
637 ARCANE_CHECK_HIP(hipDeviceSynchronize());
638
639 //compute<<<1, N>>>(d_result, N, my_func_ptr);
640 ARCANE_CHECK_HIP(hipMemcpy(h_result, d_result, N * sizeof(int), hipMemcpyDeviceToHost));
641
642 for (int i = 0; i < N; ++i) {
643 printf("%d ", h_result[i]);
644 }
645 printf("\n");
646
647 ARCANE_CHECK_HIP(hipFree(d_result));
648 return 0;
649}
650
651/*---------------------------------------------------------------------------*/
652/*---------------------------------------------------------------------------*/
653
/*!
 * \brief Runs the reduction test on several sizes with the Sequential,
 * Thread and HIP execution policies.
 *
 * NOTE(review): the declarations of runner_seq / runner_thread / runner_hip
 * are not visible in this view — confirm they are Runner instances created
 * with the matching execution policies.
 */
extern "C" int arcaneTestHipReduction()
{
  // TODO: also test with ranges not starting at 0.
  std::cout << "Test Reductions\n";
  ax::RunQueue queue1{ makeQueue(runner_seq) };
  ax::RunQueue queue2{ makeQueue(runner_thread) };
  ax::RunQueue queue3{ makeQueue(runner_hip) };
  int sizes_to_test[] = { 56, 567, 4389, 452182 };
  for (int i = 0; i < 4; ++i) {
    int vsize = sizes_to_test[i];
    arcaneTestHipReductionX(vsize, queue1, "Sequential");
    arcaneTestHipReductionX(vsize, queue2, "Thread");
    arcaneTestHipReductionX(vsize, queue3, "HIP");
  }
  return 0;
}
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Fonctions mathématiques diverses.
Fonctions de gestion mémoire et des allocateurs.
Types et fonctions pour gérer les synchronisations sur les accélérateurs.
Types et macros pour gérer les boucles sur les accélérateurs.
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
Boucle sur accélérateur avec arguments supplémentaires pour les réductions.
Classe pour effectuer une réduction 'max'.
Definition Reduce.h:680
Classe pour effectuer une réduction 'min'.
Definition Reduce.h:715
Classe pour effectuer une réduction 'somme'.
Definition Reduce.h:648
Gestion d'une commande sur accélérateur.
File d'exécution pour un accélérateur.
Gestionnaire d'exécution pour accélérateur.
Definition core/Runner.h:68
Classe de base des vues multi-dimensionnelles.
Tableaux multi-dimensionnels pour les types numériques accessibles sur accélérateurs.
MDSpanType span()
Vue multi-dimension sur l'instance.
MDSpanType mdspan()
Vue multi-dimension sur l'instance.
ConstMDSpanType constMDSpan() const
Vue constante multi-dimension sur l'instance.
void resize(Int32 dim1_size)
Modifie la taille du tableau sans conserver les valeurs actuelles.
Classe gérant un vecteur de réel de dimension 3.
Definition Real3.h:132
Vue d'un tableau d'éléments de type T.
Definition Span.h:613
Chaîne de caractères unicode.
Vecteur 1D de données avec sémantique par valeur (style STL).
__host__ __device__ Real dot(Real2 u, Real2 v)
Produit scalaire de u par v dans $\mathbb{R}^2$.
Definition MathUtils.h:96
Espace de nom pour l'utilisation des accélérateurs.
RunCommand makeCommand(const RunQueue &run_queue)
Créé une commande associée à la file run_queue.
RunQueue makeQueue(const Runner &runner)
Créé une file associée à runner.
@ HIP
Politique d'exécution utilisant l'environnement HIP.
@ Sequential
Politique d'exécution séquentielle.
@ Thread
Politique d'exécution multi-thread.
IMemoryAllocator * getDefaultDataAllocator()
Allocateur par défaut pour les données.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
Int32 Integer
Type représentant un entier.
Espace de nom de Arccore.