17#include <hip/hip_runtime.h>
19#include "arcane/utils/PlatformUtils.h"
20#include "arcane/utils/NotSupportedException.h"
21#include "arcane/utils/Real3.h"
24#include "arcane/core/Item.h"
27#include "arccore/accelerator_native/HipAccelerator.h"
28#include "arcane/accelerator/Runner.h"
29#include "arcane/accelerator/RunQueue.h"
30#include "arcane/accelerator/RunCommandLoop.h"
31#include "arcane/accelerator/NumArray.h"
37__device__ __forceinline__
unsigned int getGlobalIdx_1D_1D()
39 unsigned int blockId = blockIdx.x;
40 unsigned int threadId = blockId * blockDim.x + threadIdx.x;
48 using reference_type = value_type&;
51 ARCCORE_HOST_DEVICE Privatizer(
const T& o)
54 ARCCORE_HOST_DEVICE reference_type get_priv() {
return priv; }
58ARCCORE_HOST_DEVICE
auto thread_privatize(
const T& item) ->
Privatizer<T>
63__global__
void MyVecAdd(
double* a,
double* b,
double* out)
65 int i = blockDim.x * blockIdx.x + threadIdx.x;
74 Int64 size = a.size();
75 Int64 i = blockDim.x * blockIdx.x + threadIdx.x;
86 Int32 size =
static_cast<Int32
>(a.extent0());
87 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
98 Int64 vsize = a.size();
99 for (Int64 i = 0; i < vsize; ++i) {
100 a[i] = (double)(i + base);
101 b[i] = (double)(i * i + base);
108 Int32 vsize =
static_cast<Int32
>(a.extent0());
109 for (Int32 i = 0; i < vsize; ++i) {
110 a(i) = (double)(i + base);
111 b(i) = (double)(i * i + base);
116template <
typename F> __global__
void MyVecLambda(
int size, F func)
118 auto privatizer = thread_privatize(func);
119 auto& body = privatizer.get_priv();
121 int i = blockDim.x * blockIdx.x + threadIdx.x;
130 virtual __device__ __host__
void DoIt2() = 0;
139 virtual __device__ __host__
void DoIt2()
override {}
149 auto k = [=](Context1& ctx) { std::cout <<
"A=" << ctx.a <<
"\n"; };
155extern "C" int arcaneTestHip1()
157 constexpr int vsize = 2000;
158 std::vector<double> a(vsize);
159 std::vector<double> b(vsize);
160 std::vector<double> out(vsize);
161 for (
size_t i = 0; i < vsize; ++i) {
162 a[i] = (double)(i + 1);
163 b[i] = (double)(i * i + 1);
166 size_t mem_size = vsize *
sizeof(double);
167 double* d_a =
nullptr;
168 ARCANE_CHECK_HIP(hipMalloc(&d_a, mem_size));
169 double* d_b =
nullptr;
170 ARCANE_CHECK_HIP(hipMalloc(&d_b, mem_size));
171 double* d_out =
nullptr;
172 ARCANE_CHECK_HIP(hipMalloc(&d_out, mem_size));
174 ARCANE_CHECK_HIP(hipMemcpy(d_a, a.data(), mem_size, hipMemcpyHostToDevice));
175 ARCANE_CHECK_HIP(hipMemcpy(d_b, b.data(), mem_size, hipMemcpyHostToDevice));
176 int threadsPerBlock = 256;
177 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
178 std::cout <<
"CALLING kernel tpb=" << threadsPerBlock <<
" bpg=" << blocksPerGrid <<
"\n";
179 hipLaunchKernelGGL(MyVecAdd, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
180 ARCANE_CHECK_HIP(hipDeviceSynchronize());
181 ARCANE_CHECK_HIP(hipMemcpy(out.data(), d_out, mem_size, hipMemcpyDeviceToHost));
182 for (
size_t i = 0; i < 10; ++i)
183 std::cout <<
"V=" << out[i] <<
"\n";
187extern "C" int arcaneTestHip2()
190 constexpr int vsize = 2000;
191 size_t mem_size = vsize *
sizeof(double);
192 double* d_a =
nullptr;
193 ARCANE_CHECK_HIP(hipMallocManaged(&d_a, mem_size, hipMemAttachGlobal));
194 double* d_b =
nullptr;
195 ARCANE_CHECK_HIP(hipMallocManaged(&d_b, mem_size, hipMemAttachGlobal));
196 double* d_out =
nullptr;
197 ARCANE_CHECK_HIP(hipMallocManaged(&d_out, mem_size, hipMemAttachGlobal));
203 for (
size_t i = 0; i < vsize; ++i) {
204 d_a[i] = (double)(i + 1);
205 d_b[i] = (double)(i * i + 1);
211 int threadsPerBlock = 256;
212 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
213 std::cout <<
"CALLING kernel2 tpb=" << threadsPerBlock <<
" bpg=" << blocksPerGrid <<
"\n";
214 hipLaunchKernelGGL(MyVecAdd, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
215 ARCANE_CHECK_HIP(hipDeviceSynchronize());
216 hipError_t e = hipGetLastError();
217 std::cout <<
"END OF MYVEC1 e=" << e <<
" v=" << hipGetErrorString(e) <<
"\n";
224 for (
size_t i = 0; i < 10; ++i)
225 std::cout <<
"V=" << d_out[i] <<
"\n";
233extern "C" int arcaneTestHip3()
235 std::cout <<
"TEST_HIP_3\n";
236 constexpr int vsize = 2000;
240 ARCANE_FATAL(
"platform::getAcceleratorHostMemoryAllocator() is null");
246 for (
size_t i = 0; i < vsize; ++i) {
247 d_a[i] = (double)(i + 1);
248 d_b[i] = (double)(i * i + 1);
252 int threadsPerBlock = 256;
253 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
254 std::cout <<
"CALLING kernel2 tpb=" << threadsPerBlock <<
" bpg=" << blocksPerGrid <<
"\n";
255 hipLaunchKernelGGL(MyVecAdd2, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
256 ARCANE_CHECK_HIP(hipDeviceSynchronize());
257 hipError_t e = hipGetLastError();
258 std::cout <<
"END OF MYVEC1 e=" << e <<
" v=" << hipGetErrorString(e) <<
"\n";
259 for (
size_t i = 0; i < 10; ++i)
260 std::cout <<
"V=" << d_out[i] <<
"\n";
264 _initArrays(d_a, d_b, d_out, 2);
266 dim3 dimGrid(threadsPerBlock, 1, 1), dimBlock(blocksPerGrid, 1, 1);
272 void* kernelArgs[] = {
279 ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
280 ARCANE_CHECK_HIP(hipLaunchKernel((
void*)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
281 ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
282 for (
size_t i = 0; i < 10; ++i)
283 std::cout <<
"V2=" << d_out[i] <<
"\n";
288 _initArrays(d_a, d_b, d_out, 3);
292 auto func = [=] ARCCORE_HOST_DEVICE(
int i) {
293 d_out_span[i] = d_a_span[i] + d_b_span[i];
298 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func);
299 ARCANE_CHECK_HIP(hipDeviceSynchronize());
300 for (
size_t i = 0; i < 10; ++i)
301 std::cout <<
"V3=" << d_out[i] <<
"\n";
303 _initArrays(d_a, d_b, d_out, 4);
306 for (
int i = 0; i < vsize; ++i)
308 for (
size_t i = 0; i < 10; ++i)
309 std::cout <<
"V4=" << d_out[i] <<
"\n";
316 for (
Integer i = 0; i < vsize; ++i) {
317 Real a = (Real)(i + 2);
318 Real b = (Real)(i * i + 3);
320 d_a3[i] =
Real3(a, a + 1.0, a + 2.0);
321 d_b3[i] =
Real3(b, b + 2.0, b + 3.0);
327 auto func2 = [=] ARCCORE_HOST_DEVICE(
int i) {
328 d_out_span[i] =
math::dot(d_a3_span[i], d_b3_span[i]);
333 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func2);
334 ARCANE_CHECK_HIP(hipDeviceSynchronize());
335 std::cout <<
"TEST WITH REAL3\n";
344extern "C" int arcaneTestHipNumArray()
346 std::cout <<
"TEST_HIP_NUM_ARRAY\n";
347 constexpr int vsize = 2000;
351 ARCANE_FATAL(
"platform::getAcceleratorHostMemoryAllocator() is null");
359 for (
int i = 0; i < vsize; ++i) {
360 d_a(i) = (double)(i + 1);
361 d_b(i) = (double)(i * i + 1);
365 int threadsPerBlock = 256;
366 int blocksPerGrid = (vsize + threadsPerBlock - 1) / threadsPerBlock;
367 std::cout <<
"CALLING kernel2 tpb=" << threadsPerBlock <<
" bpg=" << blocksPerGrid <<
"\n";
368 hipLaunchKernelGGL(MyVecAdd3, blocksPerGrid, threadsPerBlock, 0, 0, d_a, d_b, d_out);
369 ARCANE_CHECK_HIP(hipDeviceSynchronize());
370 hipError_t e = hipGetLastError();
371 std::cout <<
"END OF MYVEC1 e=" << e <<
" v=" << hipGetErrorString(e) <<
"\n";
372 for (
int i = 0; i < 10; ++i)
373 std::cout <<
"V=" << d_out(i) <<
"\n";
377 _initArrays(d_a, d_b, d_out, 2);
379 dim3 dimGrid(threadsPerBlock, 1, 1), dimBlock(blocksPerGrid, 1, 1);
385 void* kernelArgs[] = {
392 ARCANE_CHECK_HIP(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking));
393 ARCANE_CHECK_HIP(hipLaunchKernel((
void*)MyVecAdd2, dimGrid, dimBlock, kernelArgs, smemSize, stream));
394 ARCANE_CHECK_HIP(hipStreamSynchronize(stream));
395 for (
int i = 0; i < 10; ++i)
396 std::cout <<
"V2=" << d_out(i) <<
"\n";
401 _initArrays(d_a, d_b, d_out, 3);
405 auto func = [=] ARCCORE_HOST_DEVICE(
int i) {
406 d_out_span(i) = d_a_span(i) + d_b_span(i);
411 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func);
412 ARCANE_CHECK_HIP(hipDeviceSynchronize());
413 for (
int i = 0; i < 10; ++i)
414 std::cout <<
"V3=" << d_out(i) <<
"\n";
416 _initArrays(d_a, d_b, d_out, 4);
419 for (
int i = 0; i < vsize; ++i)
421 for (
int i = 0; i < 10; ++i)
422 std::cout <<
"V4=" << d_out(i) <<
"\n";
429 for (
Integer i = 0; i < vsize; ++i) {
430 Real a = (Real)(i + 2);
431 Real b = (Real)(i * i + 3);
434 d_a3(i, 1) = a + 1.0;
435 d_a3(i, 2) = a + 2.0;
438 d_b3(i, 1) = b + 1.0;
439 d_b3(i, 2) = b + 2.0;
445 auto func2 = [=] ARCCORE_HOST_DEVICE(
int i) {
446 Real3 xa(d_a3_span(i, 0), d_a3_span(i, 1), d_a3_span(i, 2));
447 Real3 xb(d_b3_span(i, 0), d_b3_span(i, 1), d_b3_span(i, 2));
453 hipLaunchKernelGGL(MyVecLambda, blocksPerGrid, threadsPerBlock, 0, 0, vsize, func2);
454 ARCANE_CHECK_HIP(hipDeviceSynchronize());
455 std::cout <<
"TEST WITH REAL3\n";
464#include "arcane/accelerator/Reduce.h"
471 std::cout <<
"TestReduction vsize=" << vsize <<
"\n";
476 for (
Integer i = 0; i < vsize; ++i) {
477 int a = 5 + ((i + 2) % 43);
494 double vxa = (double)(xa[i]);
496 sum_reducer.add(xa[i]);
497 sum_double_reducer.add(vxa);
498 max_int_reducer.max(xa[i]);
499 max_double_reducer.max(vxa);
500 min_int_reducer.min(xa[i]);
501 min_double_reducer.min(vxa);
506 int sum_int_value = sum_reducer.reduce();
507 double sum_double_value = sum_double_reducer.reduce();
508 std::cout <<
"SumReducer name=" << name <<
" v_int=" << sum_int_value
509 <<
" v_double=" << sum_double_value
511 int max_int_value = max_int_reducer.reduce();
512 double max_double_value = max_double_reducer.reduce();
513 std::cout <<
"MaxReducer name=" << name <<
" v_int=" << max_int_value
514 <<
" v_double=" << max_double_value
516 int min_int_value = min_int_reducer.reduce();
517 double min_double_value = min_double_reducer.reduce();
518 std::cout <<
"MinReducer name=" << name <<
" v_int=" << min_int_value
519 <<
" v_double=" << min_double_value
525__device__
int my_add(
int a,
int b)
530__device__
int mul(
int a,
int b)
538__global__
void compute(
int* d_result,
int N,
int (*op_func)(
int,
int))
540 int idx = threadIdx.x + blockIdx.x * blockDim.x;
545 d_result[idx] = op_func(idx, idx);
548using BinaryFuncType = int (*)(
int a,
int b);
550__device__ int (*my_func_ptr)(
int a,
int b) = my_add;
552__global__
void kernelSetFunction(BinaryFuncType* func_ptr)
560 static __device__
int doFunc(
int a,
int b)
571 virtual ARCCORE_HOST_DEVICE
int apply(
int a,
int b) = 0;
579 ARCCORE_HOST_DEVICE
int apply(
int a,
int b)
override {
return a + b; }
582__global__
void compute_virtual(
int* d_result,
int N,
FooBase* ptr)
587 int idx = threadIdx.x + blockIdx.x * blockDim.x;
592 d_result[idx] = ptr->apply(idx, idx);
596__global__
void createFooDerived(
FooDerived* ptr)
598 int idx = threadIdx.x + blockIdx.x * blockDim.x;
604extern "C" int arcaneTestVirtualFunction()
606 std::cout <<
"Test function pointer\n";
614 ARCANE_CHECK_HIP(hipMalloc(&foo_derived,
sizeof(
FooDerived)));
615 createFooDerived<<<1, 1>>>(foo_derived);
616 ARCANE_CHECK_HIP(hipDeviceSynchronize());
618 ARCANE_CHECK_HIP(hipMalloc(&d_result, N *
sizeof(
int)));
620 int (*host_func)(int, int) =
nullptr;
621 ARCANE_CHECK_HIP(hipMalloc(&host_func,
sizeof(
void*) * 8));
625 std::cout <<
"Set function pointer\n";
628 std::cout <<
"Wait end\n";
630 ARCANE_CHECK_HIP(hipDeviceSynchronize());
632 std::cout <<
"Calling compute\n";
638 compute_virtual<<<1, N>>>(d_result, N, foo_derived);
639 ARCANE_CHECK_HIP(hipDeviceSynchronize());
642 ARCANE_CHECK_HIP(hipMemcpy(h_result, d_result, N *
sizeof(
int), hipMemcpyDeviceToHost));
644 for (
int i = 0; i < N; ++i) {
645 printf(
"%d ", h_result[i]);
649 ARCANE_CHECK_HIP(hipFree(d_result));
656extern "C" int arcaneTestHipReduction()
659 std::cout <<
"Test Reductions\n";
666 int sizes_to_test[] = { 56, 567, 4389, 452182 };
667 for (
int i = 0; i < 4; ++i) {
668 int vsize = sizes_to_test[i];
669 arcaneTestHipReductionX(vsize, queue1,
"Sequential");
670 arcaneTestHipReductionX(vsize, queue2,
"Thread");
671 arcaneTestHipReductionX(vsize, queue3,
"HIP");
#define ARCANE_FATAL(...)
Macro throwing a FatalErrorException.
Various mathematical functions.
Memory and allocator management functions.
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
1D loop on accelerator with additional arguments.
Class to perform a 'max' reduction.
Class to perform a 'min' reduction.
Class to perform a 'sum' reduction.
Management of an accelerator command.
Execution queue for an accelerator.
Execution manager for accelerator.
Interface for a memory allocator.
Base class for multi-dimensional views.
Multi-dimensional arrays for numerical types accessible on accelerators.
MDSpanType span()
Multi-dimensional view on the instance.
MDSpanType mdspan()
Multi-dimensional view on the instance.
ConstMDSpanType constMDSpan() const
Constant multi-dimensional view on the instance.
void resize(Int32 dim1_size)
Resizes the array without keeping current values.
Class managing a 3-dimensional real vector.
View of an array of elements of type T.
Unicode character string.
1D data vector with value semantics (STL style).
__host__ __device__ Real dot(Real2 u, Real2 v)
Dot product of u by v in .
Namespace for accelerator usage.
RunCommand makeCommand(const RunQueue &run_queue)
Creates a command associated with the queue run_queue.
RunQueue makeQueue(const Runner &runner)
Creates a queue associated with runner.
@ HIP
Execution policy using the HIP environment.
@ Sequential
Sequential execution policy.
@ Thread
Multi-threaded execution policy.
IMemoryAllocator * getDefaultDataAllocator()
Default allocator for data.
IMemoryAllocator * getAllocator(eMemoryResource mem_resource)
Default allocator for the resource mem_resource.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
Int32 Integer
Type representing an integer.
@ UnifiedMemory
Allocates using unified memory.