Arcane  4.1.12.0
Developer documentation
Loading...
Searching...
No Matches
Test.cu.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* Test.cu.cc (C) 2000-2025 */
9/* */
10/* File containing tests for the HIP implementation. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include <vector>
15#include <iostream>
16
17#include <hip/hip_runtime.h>
18
19#include "arcane/utils/PlatformUtils.h"
20#include "arcane/utils/NotSupportedException.h"
21#include "arcane/utils/Real3.h"
23
24#include "arcane/core/Item.h"
26
27#include "arccore/accelerator_native/HipAccelerator.h"
28#include "arcane/accelerator/Runner.h"
29#include "arcane/accelerator/RunQueue.h"
30#include "arcane/accelerator/RunCommandLoop.h"
31#include "arcane/accelerator/NumArray.h"
32
33using namespace Arccore;
34using namespace Arcane;
35using namespace Arcane::Accelerator;
36
37__device__ __forceinline__ unsigned int getGlobalIdx_1D_1D()
38{
39 unsigned int blockId = blockIdx.x;
40 unsigned int threadId = blockId * blockDim.x + threadIdx.x;
41 return threadId;
42}
43
44template <typename T>
45struct Privatizer
46{
47 using value_type = T; //std::decay<T>;
48 using reference_type = value_type&;
49 value_type priv;
50
51 ARCCORE_HOST_DEVICE Privatizer(const T& o)
52 : priv{ o }
53 {}
54 ARCCORE_HOST_DEVICE reference_type get_priv() { return priv; }
55};
56
57template <typename T>
58ARCCORE_HOST_DEVICE auto thread_privatize(const T& item) -> Privatizer<T>
59{
60 return Privatizer<T>{ item };
61}
62
63__global__ void MyVecAdd(double* a, double* b, double* out)
64{
65 int i = blockDim.x * blockIdx.x + threadIdx.x;
66 out[i] = a[i] + b[i];
67 if (i < 10) {
68 // printf("A=%d %lf %lf %lf %d\n",i,a[i],b[i],out[i],3);
69 }
70}
71
72__global__ void MyVecAdd2(Span<const double> a, Span<const double> b, Span<double> out)
73{
74 Int64 size = a.size();
75 Int64 i = blockDim.x * blockIdx.x + threadIdx.x;
76 if (i >= size)
77 return;
78 out[i] = a[i] + b[i];
79 if (i < 10) {
80 //printf("A=%d %lf %lf %lf %d\n",i,a[i],b[i],out[i],i);
81 }
82}
83
85{
86 Int32 size = static_cast<Int32>(a.extent0());
87 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
88 if (i >= size)
89 return;
90 out(i) = a(i) + b(i);
91 if (i < 10) {
92 //printf("A=%d %lf %lf %lf %d\n",i,a(i),b(i),out(i),i);
93 }
94}
95
96void _initArrays(Span<double> a, Span<double> b, Span<double> c, int base)
97{
98 Int64 vsize = a.size();
99 for (Int64 i = 0; i < vsize; ++i) {
100 a[i] = (double)(i + base);
101 b[i] = (double)(i * i + base);
102 c[i] = 0.0;
103 }
104}
105
107{
108 Int32 vsize = static_cast<Int32>(a.extent0());
109 for (Int32 i = 0; i < vsize; ++i) {
110 a(i) = (double)(i + base);
111 b(i) = (double)(i * i + base);
112 c(i) = 0.0;
113 }
114}
115
116template <typename F> __global__ void MyVecLambda(int size, F func)
117{
118 auto privatizer = thread_privatize(func);
119 auto& body = privatizer.get_priv();
120
121 int i = blockDim.x * blockIdx.x + threadIdx.x;
122 if (i < size)
123 body(i);
124}
125
126namespace TestCuda
127{
128class IA
129{
130 virtual __device__ __host__ void DoIt2() = 0;
131};
132
133class A
134: public IA
135{
136 public:
137
138 //__global__ void DoIt(){}
139 virtual __device__ __host__ void DoIt2() override {}
140};
141} // namespace TestCuda
142
143void MyTestFunc1()
144{
145 struct Context1
146 {
147 int a;
148 };
149 auto k = [=](Context1& ctx) { std::cout << "A=" << ctx.a << "\n"; };
150 Context1 my_ctx;
151 my_ctx.a = 3;
152 k(my_ctx);
153}
154
155extern "C" int arcaneTestHip1()
156{
157 constexpr int vsize = 2000;
158 std::vector<double> a(vsize);
159 std::vector<double> b(vsize);
160 std::vector<double> out(vsize);
161 for (size_t i = 0; i < vsize; ++i) {
162 a[i] = (double)(i + 1);
163 b[i] = (double)(i * i + 1);
164 out[i] = 0.0; //a[i] + b[i];
165 }
166 size_t mem_size = vsize * sizeof(double);
167 double* d_a = nullptr;
168 ARCANE_CHECK_HIP(hipMalloc(&d_a, mem_size));
169 double* d_b = nullptr;
170 ARCANE_CHECK_HIP(hipMalloc(&d_b, mem_size));
171 double* d_out = nullptr;
172 ARCANE_CHECK_HIP(hipMalloc(&d_out, mem_size));
173
174 ARCANE_CHECK_HIP(hipMemcpy(d_a, a.data(), mem_size, hipMemcpyHostToDevice));
175 ARCANE_CHECK_HIP(hipMemcpy(d_b, b.data(), mem_size, hipMemcpyHostToDevice));
176 int threadsPerBlock = 256;
177 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
178 std::cout << "CALLING kernel tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
179 hipLaunchKernelGGL(MyVecAdd, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
180 ARCANE_CHECK_HIP(hipDeviceSynchronize());
181 ARCANE_CHECK_HIP(hipMemcpy(out.data(), d_out, mem_size, hipMemcpyDeviceToHost));
182 for (size_t i = 0; i < 10; ++i)
183 std::cout << "V=" << out[i] << "\n";
184 return 0;
185}
186
187extern "C" int arcaneTestHip2()
188{
189 MyTestFunc1();
190 constexpr int vsize = 2000;
191 size_t mem_size = vsize * sizeof(double);
192 double* d_a = nullptr;
193 ARCANE_CHECK_HIP(hipMallocManaged(&d_a, mem_size, hipMemAttachGlobal));
194 double* d_b = nullptr;
195 ARCANE_CHECK_HIP(hipMallocManaged(&d_b, mem_size, hipMemAttachGlobal));
196 double* d_out = nullptr;
197 ARCANE_CHECK_HIP(hipMallocManaged(&d_out, mem_size, hipMemAttachGlobal));
198
199 //d_a = new double[vsize];
200 //d_b = new double[vsize];
201 //d_out = new double[vsize];
202
203 for (size_t i = 0; i < vsize; ++i) {
204 d_a[i] = (double)(i + 1);
205 d_b[i] = (double)(i * i + 1);
206 d_out[i] = 0.0; //a[i] + b[i];
207 }
208
209 //hipMemcpy(d_a, a.data(), mem_size, hipMemcpyHostToDevice);
210 //hipMemcpy(d_b, b.data(), mem_size, hipMemcpyHostToDevice);
211 int threadsPerBlock = 256;
212 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
213 std::cout << "CALLING kernel2 tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
214 hipLaunchKernelGGL(MyVecAdd, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
215 ARCANE_CHECK_HIP(hipDeviceSynchronize());
216 hipError_t e = hipGetLastError();
217 std::cout << "END OF MYVEC1 e=" << e << " v=" << hipGetErrorString(e) << "\n";
218 //hipDeviceSynchronize();
219 //e = hipGetLastError();
220 //std::cout << "END OF MYVEC2 e=" << e << " v=" << hipGetErrorString(e) << "\n";
221 //hipMemcpy(out.data(), d_out, mem_size, hipMemcpyDeviceToHost);
222 //e = hipGetLastError();
223 //std::cout << "END OF MYVEC3 e=" << e << " v=" << hipGetErrorString(e) << "\n";
224 for (size_t i = 0; i < 10; ++i)
225 std::cout << "V=" << d_out[i] << "\n";
226
227 return 0;
228}
229
230/*---------------------------------------------------------------------------*/
231/*---------------------------------------------------------------------------*/
232
233extern "C" int arcaneTestHip3()
234{
235 std::cout << "TEST_HIP_3\n";
236 constexpr int vsize = 2000;
239 if (!hip_allocator2)
240 ARCANE_FATAL("platform::getAcceleratorHostMemoryAllocator() is null");
241 UniqueArray<double> d_a(hip_allocator, vsize);
242 MyTestFunc1();
243 UniqueArray<double> d_b(hip_allocator, vsize);
244 UniqueArray<double> d_out(hip_allocator, vsize);
245
246 for (size_t i = 0; i < vsize; ++i) {
247 d_a[i] = (double)(i + 1);
248 d_b[i] = (double)(i * i + 1);
249 d_out[i] = 0.0; //a[i] + b[i];
250 }
251
252 int threadsPerBlock = 256;
253 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
254 std::cout << "CALLING kernel2 tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
255 hipLaunchKernelGGL(MyVecAdd2, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
256 ARCANE_CHECK_HIP(hipDeviceSynchronize());
257 hipError_t e = hipGetLastError();
258 std::cout << "END OF MYVEC1 e=" << e << " v=" << hipGetErrorString(e) << "\n";
259 for (size_t i = 0; i < 10; ++i)
260 std::cout << "V=" << d_out[i] << "\n";
261
262 // Launch a kernel dynamically
263 {
264 _initArrays(d_a, d_b, d_out, 2);
265
266 dim3 dimGrid(threadsPerBlock, 1, 1), dimBlock(blocksPerGrid, 1, 1);
267
268 Span<const double> d_a_span = d_a.span();
269 Span<const double> d_b_span = d_b.span();
270 Span<double> d_out_view = d_out.span();
271
272 void* kernelArgs[] = {
273 (void*)&d_a_span,
274 (void*)&d_b_span,
275 (void*)&d_out_view
276 };
277 size_t smemSize = 0;
278 hipStream_t stream;
279 ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
280 ARCANE_CHECK_HIP(hipLaunchKernel((void*)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
281 ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
282 for (size_t i = 0; i < 10; ++i)
283 std::cout << "V2=" << d_out[i] << "\n";
284 }
285
286 // Launch a lambda
287 {
288 _initArrays(d_a, d_b, d_out, 3);
289 Span<const double> d_a_span = d_a.span();
290 Span<const double> d_b_span = d_b.span();
291 Span<double> d_out_span = d_out.span();
292 auto func = [=] ARCCORE_HOST_DEVICE(int i) {
293 d_out_span[i] = d_a_span[i] + d_b_span[i];
294 if (i<10){
295 //printf("A=%d %lf %lf %lf\n",i,d_a_span[i],d_b_span[i],d_out_span[i]);
296 } };
297
298 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func);
299 ARCANE_CHECK_HIP(hipDeviceSynchronize());
300 for (size_t i = 0; i < 10; ++i)
301 std::cout << "V3=" << d_out[i] << "\n";
302
303 _initArrays(d_a, d_b, d_out, 4);
304
305 // Calls the 'host' version of the lambda
306 for (int i = 0; i < vsize; ++i)
307 func(i);
308 for (size_t i = 0; i < 10; ++i)
309 std::cout << "V4=" << d_out[i] << "\n";
310 }
311
312 // Use Real3
313 {
314 UniqueArray<Real3> d_a3(hip_allocator, vsize);
315 UniqueArray<Real3> d_b3(hip_allocator, vsize);
316 for (Integer i = 0; i < vsize; ++i) {
317 Real a = (Real)(i + 2);
318 Real b = (Real)(i * i + 3);
319
320 d_a3[i] = Real3(a, a + 1.0, a + 2.0);
321 d_b3[i] = Real3(b, b + 2.0, b + 3.0);
322 }
323
324 Span<const Real3> d_a3_span = d_a3.span();
325 Span<const Real3> d_b3_span = d_b3.span();
326 Span<double> d_out_span = d_out.span();
327 auto func2 = [=] ARCCORE_HOST_DEVICE(int i) {
328 d_out_span[i] = math::dot(d_a3_span[i], d_b3_span[i]);
329 if (i < 10) {
330 //printf("DOT=%d %lf\n",i,d_out_span[i]);
331 }
332 };
333 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func2);
334 ARCANE_CHECK_HIP(hipDeviceSynchronize());
335 std::cout << "TEST WITH REAL3\n";
336 }
337
338 return 0;
339}
340
341/*---------------------------------------------------------------------------*/
342/*---------------------------------------------------------------------------*/
343
344extern "C" int arcaneTestHipNumArray()
345{
346 std::cout << "TEST_HIP_NUM_ARRAY\n";
347 constexpr int vsize = 2000;
348 //IMemoryAllocator* hip_allocator = Arcane::Accelerator::Hip::getHipMemoryAllocator();
350 if (!hip_allocator2)
351 ARCANE_FATAL("platform::getAcceleratorHostMemoryAllocator() is null");
353 MyTestFunc1();
356 d_a.resize(vsize);
357 d_b.resize(vsize);
358 d_out.resize(vsize);
359 for (int i = 0; i < vsize; ++i) {
360 d_a(i) = (double)(i + 1);
361 d_b(i) = (double)(i * i + 1);
362 d_out(i) = 0.0; //a[i] + b[i];
363 }
364
365 int threadsPerBlock = 256;
366 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
367 std::cout << "CALLING kernel2 tpb=" << threadsPerBlock << " bpg=" << blocksPerGrid << "\n";
368 hipLaunchKernelGGL(MyVecAdd3, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
369 ARCANE_CHECK_HIP(hipDeviceSynchronize());
370 hipError_t e = hipGetLastError();
371 std::cout << "END OF MYVEC1 e=" << e << " v=" << hipGetErrorString(e) << "\n";
372 for (int i = 0; i < 10; ++i)
373 std::cout << "V=" << d_out(i) << "\n";
374
375 // Launch a kernel dynamically
376 {
377 _initArrays(d_a, d_b, d_out, 2);
378
379 dim3 dimGrid(threadsPerBlock, 1, 1), dimBlock(blocksPerGrid, 1, 1);
380
383 MDSpan<double, MDDim1> d_out_view = d_out.mdspan();
384
385 void* kernelArgs[] = {
386 (void*)&d_a_span,
387 (void*)&d_b_span,
388 (void*)&d_out_view
389 };
390 size_t smemSize = 0;
391 hipStream_t stream;
392 ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
393 ARCANE_CHECK_HIP(hipLaunchKernel((void*)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
394 ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
395 for (int i = 0; i < 10; ++i)
396 std::cout << "V2=" << d_out(i) << "\n";
397 }
398
399 // Launch a lambda
400 {
401 _initArrays(d_a, d_b, d_out, 3);
404 MDSpan<double, MDDim1> d_out_span = d_out.mdspan();
405 auto func = [=] ARCCORE_HOST_DEVICE(int i) {
406 d_out_span(i) = d_a_span(i) + d_b_span(i);
407 if (i<10){
408 //printf("A=%d %lf %lf %lf\n",i,d_a_span(i),d_b_span(i),d_out_span(i));
409 } };
410
411 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func);
412 ARCANE_CHECK_HIP(hipDeviceSynchronize());
413 for (int i = 0; i < 10; ++i)
414 std::cout << "V3=" << d_out(i) << "\n";
415
416 _initArrays(d_a, d_b, d_out, 4);
417
418 // Calls the 'host' version of the lambda
419 for (int i = 0; i < vsize; ++i)
420 func(i);
421 for (int i = 0; i < 10; ++i)
422 std::cout << "V4=" << d_out(i) << "\n";
423 }
424
425 // Uses Real3 with a multi-dimensional array
426 {
427 NumArray<Real, MDDim2> d_a3(vsize, 3);
428 NumArray<Real, MDDim2> d_b3(vsize, 3);
429 for (Integer i = 0; i < vsize; ++i) {
430 Real a = (Real)(i + 2);
431 Real b = (Real)(i * i + 3);
432
433 d_a3(i, 0) = a;
434 d_a3(i, 1) = a + 1.0;
435 d_a3(i, 2) = a + 2.0;
436
437 d_b3(i, 0) = b;
438 d_b3(i, 1) = b + 1.0;
439 d_b3(i, 2) = b + 2.0;
440 }
441
442 MDSpan<const Real, MDDim2> d_a3_span = d_a3.constMDSpan();
443 MDSpan<const Real, MDDim2> d_b3_span = d_b3.constMDSpan();
444 MDSpan<double, MDDim1> d_out_span = d_out.mdspan();
445 auto func2 = [=] ARCCORE_HOST_DEVICE(int i) {
446 Real3 xa(d_a3_span(i, 0), d_a3_span(i, 1), d_a3_span(i, 2));
447 Real3 xb(d_b3_span(i, 0), d_b3_span(i, 1), d_b3_span(i, 2));
448 d_out_span(i) = math::dot(xa, xb);
449 if (i < 10) {
450 //printf("DOT NUMARRAY=%d %lf\n",i,d_out_span(i));
451 }
452 };
453 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func2);
454 ARCANE_CHECK_HIP(hipDeviceSynchronize());
455 std::cout << "TEST WITH REAL3\n";
456 }
457
458 return 0;
459}
460
461/*---------------------------------------------------------------------------*/
462/*---------------------------------------------------------------------------*/
463
464#include "arcane/accelerator/Reduce.h"
465
466namespace ax = Arcane::Accelerator;
467
468void arcaneTestHipReductionX(int vsize, ax::RunQueue& queue, const String& name)
469{
470 using namespace Arcane::Accelerator;
471 std::cout << "TestReduction vsize=" << vsize << "\n";
473 UniqueArray<int> d_a(hip_allocator2, vsize);
474 UniqueArray<int> d_out(hip_allocator2, vsize);
475
476 for (Integer i = 0; i < vsize; ++i) {
477 int a = 5 + ((i + 2) % 43);
478 d_a[i] = a;
479 d_out[i] = 0;
480 //std::cout << "I=" << i << " a=" << a << "\n";
481 }
482 RunCommand command = makeCommand(queue);
483 ReducerSum<int> sum_reducer(command);
484 ReducerSum<double> sum_double_reducer(command);
485 ReducerMax<int> max_int_reducer(command);
486 ReducerMax<double> max_double_reducer(command);
487 ReducerMin<int> min_int_reducer(command);
488 ReducerMin<double> min_double_reducer(command);
489 Span<const int> xa = d_a.span();
490 Span<int> xout = d_out.span();
491 command << RUNCOMMAND_LOOP1(idx, vsize)
492 {
493 auto [i] = idx();
494 double vxa = (double)(xa[i]);
495 xout[i] = xa[i];
496 sum_reducer.add(xa[i]);
497 sum_double_reducer.add(vxa);
498 max_int_reducer.max(xa[i]);
499 max_double_reducer.max(vxa);
500 min_int_reducer.min(xa[i]);
501 min_double_reducer.min(vxa);
502 //if (i<10)
503 //printf("Do Reduce i=%d v=%d %lf\n",i,xa[i],vxa);
504 };
505
506 int sum_int_value = sum_reducer.reduce();
507 double sum_double_value = sum_double_reducer.reduce();
508 std::cout << "SumReducer name=" << name << " v_int=" << sum_int_value
509 << " v_double=" << sum_double_value
510 << "\n";
511 int max_int_value = max_int_reducer.reduce();
512 double max_double_value = max_double_reducer.reduce();
513 std::cout << "MaxReducer name=" << name << " v_int=" << max_int_value
514 << " v_double=" << max_double_value
515 << "\n";
516 int min_int_value = min_int_reducer.reduce();
517 double min_double_value = min_double_reducer.reduce();
518 std::cout << "MinReducer name=" << name << " v_int=" << min_int_value
519 << " v_double=" << min_double_value
520 << "\n";
521}
522
523#include <stdio.h>
524
525__device__ int my_add(int a, int b)
526{
527 return a + b;
528}
529
530__device__ int mul(int a, int b)
531{
532 return a * b;
533}
534
535// Pointeur de fonction sur le device
536//__device__ int (*op)(int, int) = &add;
537
538__global__ void compute(int* d_result, int N, int (*op_func)(int, int))
539{
540 int idx = threadIdx.x + blockIdx.x * blockDim.x;
541 //if (idx == 0)
542 //printf("MyFuncDevice=%p\n",op_func);
543
544 if (idx < N) {
545 d_result[idx] = op_func(idx, idx);
546 }
547}
548using BinaryFuncType = int (*)(int a, int b);
549
550__device__ int (*my_func_ptr)(int a, int b) = my_add;
551
552__global__ void kernelSetFunction(BinaryFuncType* func_ptr)
553{
554 *func_ptr = my_add;
555 //printf("MyAddDevice=%p\n",my_add);
556}
557
559{
560 static __device__ int doFunc(int a, int b)
561 {
562 return a + b;
563 }
564};
565
567{
568 public:
569
570 //virtual ARCCORE_HOST_DEVICE ~FooBase() {}
571 virtual ARCCORE_HOST_DEVICE int apply(int a, int b) = 0;
572};
573
575: public FooBase
576{
577 public:
578
579 ARCCORE_HOST_DEVICE int apply(int a, int b) override { return a + b; }
580};
581
582__global__ void compute_virtual(int* d_result, int N, FooBase* ptr)
583{
584 //FooBase* ptr = nullptr;
585 //FooDerived my_foo;
586 //ptr = &my_foo;
587 int idx = threadIdx.x + blockIdx.x * blockDim.x;
588 //if (idx == 0)
589 //printf("MyFuncDevice=%p\n",op_func);
590
591 if (idx < N) {
592 d_result[idx] = ptr->apply(idx, idx);
593 }
594}
595
596__global__ void createFooDerived(FooDerived* ptr)
597{
598 int idx = threadIdx.x + blockIdx.x * blockDim.x;
599 if (idx == 0) {
600 new (ptr) FooDerived();
601 }
602}
603
604extern "C" int arcaneTestVirtualFunction()
605{
606 std::cout << "Test function pointer\n";
607 //std::cout << "FuncPtr direct=" << my_func_ptr << "\n";
608 std::cout.flush();
609
610 const int N = 10;
611 int h_result[N];
612 int* d_result;
613 FooDerived* foo_derived = nullptr;
614 ARCANE_CHECK_HIP(hipMalloc(&foo_derived, sizeof(FooDerived)));
615 createFooDerived<<<1, 1>>>(foo_derived);
616 ARCANE_CHECK_HIP(hipDeviceSynchronize());
617
618 ARCANE_CHECK_HIP(hipMalloc(&d_result, N * sizeof(int)));
619
620 int (*host_func)(int, int) = nullptr;
621 ARCANE_CHECK_HIP(hipMalloc(&host_func, sizeof(void*) * 8));
622
623 //my_func_ptr = my_add;
624 //cudaMemcpyFromSymbol(&host_func, my_func_ptr, sizeof(void*));
625 std::cout << "Set function pointer\n";
626 //kernelSetFunction<<<1, 1>>>(&host_func);
627
628 std::cout << "Wait end\n";
629 std::cout.flush();
630 ARCANE_CHECK_HIP(hipDeviceSynchronize());
631
632 std::cout << "Calling compute\n";
633 std::cout.flush();
634
635 // Call the kernel
636 //compute<<<1, N>>>(d_result, N, host_func);
637 //compute<<<1, N>>>(d_result, N, my_func_ptr);
638 compute_virtual<<<1, N>>>(d_result, N, foo_derived);
639 ARCANE_CHECK_HIP(hipDeviceSynchronize());
640
641 //compute<<<1, N>>>(d_result, N, my_func_ptr);
642 ARCANE_CHECK_HIP(hipMemcpy(h_result, d_result, N * sizeof(int), hipMemcpyDeviceToHost));
643
644 for (int i = 0; i < N; ++i) {
645 printf("%d ", h_result[i]);
646 }
647 printf("\n");
648
649 ARCANE_CHECK_HIP(hipFree(d_result));
650 return 0;
651}
652
653/*---------------------------------------------------------------------------*/
654/*---------------------------------------------------------------------------*/
655
656extern "C" int arcaneTestHipReduction()
657{
658 // TODO: tester does not start from 0.
659 std::cout << "Test Reductions\n";
663 ax::RunQueue queue1{ makeQueue(runner_seq) };
664 ax::RunQueue queue2{ makeQueue(runner_thread) };
665 ax::RunQueue queue3{ makeQueue(runner_hip) };
666 int sizes_to_test[] = { 56, 567, 4389, 452182 };
667 for (int i = 0; i < 4; ++i) {
668 int vsize = sizes_to_test[i];
669 arcaneTestHipReductionX(vsize, queue1, "Sequential");
670 arcaneTestHipReductionX(vsize, queue2, "Thread");
671 arcaneTestHipReductionX(vsize, queue3, "HIP");
672 }
673 return 0;
674}
#define ARCANE_FATAL(...)
Macro throwing a FatalErrorException.
Various mathematical functions.
Memory and allocator management functions.
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
1D loop on accelerator with additional arguments.
Base class for multi-dimensional views.
Multi-dimensional arrays for numerical types accessible on accelerators.
MDSpanType span()
Multi-dimensional view on the instance.
MDSpanType mdspan()
Multi-dimensional view on the instance.
ConstMDSpanType constMDSpan() const
Constant multi-dimensional view on the instance.
void resize(Int32 dim1_size)
Resizes the array without keeping current values.
Class managing a 3-dimensional real vector.
Definition Real3.h:132
View of an array of elements of type T.
Definition Span.h:635
1D data vector with value semantics (STL style).
__host__ __device__ Real dot(Real2 u, Real2 v)
Dot product of u by v in .
Definition MathUtils.h:94
RunCommand makeCommand(const RunQueue &run_queue)
Creates a command associated with the queue run_queue.
RunQueue makeQueue(const Runner &runner)
Creates a queue associated with runner.
@ HIP
Execution policy using the HIP environment.
@ Thread
Multi-threaded execution policy.
IMemoryAllocator * getDefaultDataAllocator()
Default allocator for data.
IMemoryAllocator * getAllocator(eMemoryResource mem_resource)
Default allocator for the resource mem_resource.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
Int32 Integer
Type representing an integer.
@ UnifiedMemory
Allocates using unified memory.
Namespace of Arccore.