8#include <gtest/gtest.h>
10#include "arccore/base/PlatformUtils.h"
12#include "arccore/common/accelerator/Runner.h"
13#include "arccore/common/accelerator/RunQueue.h"
14#include "arccore/common/NumArray.h"
21#include "arccore/accelerator/LocalMemory.h"
34 Int32 nb_value, Int32 nb_loop)
45 by_block_partial_sum.resize(2048);
48 reduce_result.resize(1);
53 for (
int j = 0; j < nb_loop; ++j) {
56 auto partial_sum_span =
viewInOut(command, by_block_partial_sum);
57 auto out_reduce_result =
viewOut(command, reduce_result);
60 auto grid = iter.grid();
61 auto block = iter.block();
62 auto w = iter.workItem();
65 for (Int32 i : w.linearIndexes())
68 Int32 nb_block = grid.nbBlock();
69#if defined(ARCCORE_COMPILING_CUDA_OR_HIP) && defined(ARCCORE_DEVICE_CODE)
70 my_v = Arcane::Accelerator::Impl::block_reduce<Arcane::Accelerator::Impl::ReduceFunctorSum<Int64>, 32, Int64>(my_v);
72 if (w.rankInBlock() == 0) {
74 partial_sum_span[block.groupRank()] = my_v;
77 if (w.rankInBlock() == 0 && block.groupRank() == 0) {
79 for (Int32 i = 0; i < nb_block; ++i) {
80 Int64 v = partial_sum_span[i];
82 final_sum += partial_sum_span[i];
84 partial_sum_span[0] = final_sum;
85 out_reduce_result[0] = final_sum;
86#if !defined(__INTEL_LLVM_COMPILER)
92 total_x += reduce_result[0];
96 Int64 nb_byte = c.
size() *
sizeof(Int64) * nb_loop;
98 Real nb_giga_byte_second = (
static_cast<Real
>(nb_byte) / 1.0e9) / diff;
99 std::cout <<
"** TotalCooperativeLaunch=" << total_x
100 <<
" nb_value=" << nb_value
101 <<
" GB/s=" << nb_giga_byte_second <<
" time=" << diff <<
"\n";
Types and functions for managing synchronizations on accelerators.
Types and macros for managing hierarchical parallelism on accelerators.
#define RUNCOMMAND_LAUNCH(iter_name, bounds,...)
Macro to launch a command using hierarchical, possibly cooperative, parallelism.
Iteration range of a loop using cooperative hierarchical parallelism.
Execution queue for an accelerator.
eMemoryResource memoryResource() const
Memory resource used for allocations with this instance.
Multi-dimensional arrays for numerical types accessible on accelerators.
View of an array of elements of type T.
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
Namespace for accelerator usage.
RunCommand makeCommand(const RunQueue &run_queue)
Creates a command associated with the queue run_queue.
auto viewInOut(const ViewBuildInfo &vbi, CellMaterialVariableScalarRef< DataType > &var)
Read/write view for scalar material variables.
auto viewOut(const ViewBuildInfo &vbi, CellMaterialVariableScalarRef< DataType > &var)
Write view for scalar material variables.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
@ HostPinned
Allocates pinned (page-locked) memory on the host.