Arcane  4.1.12.0
Developer documentation
Loading...
Searching...
No Matches
TestCooperativeLaunch_Accelerator2.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7
8#include <gtest/gtest.h>
9
10#include "arccore/base/PlatformUtils.h"
11
12#include "arccore/common/accelerator/Runner.h"
13#include "arccore/common/accelerator/RunQueue.h"
14#include "arccore/common/NumArray.h"
15
17
19
21#include "arccore/accelerator/LocalMemory.h"
22
23/*---------------------------------------------------------------------------*/
24/*---------------------------------------------------------------------------*/
25
26using namespace Arcane;
27using namespace Arcane::Accelerator;
28
29/*---------------------------------------------------------------------------*/
30/*---------------------------------------------------------------------------*/
31
32extern "C++" Int64
33_testCooperativeLaunch2(RunQueue queue, SmallSpan<const Int64> c,
34 Int32 nb_value, Int32 nb_loop)
35{
36 Int64 total_x = 0;
37 // Partial values per block.
38 // Must be sized to the maximum number of possible blocks
39 NumArray<Int64, MDDim1> by_block_partial_sum(queue.memoryResource());
40 by_block_partial_sum.resize(2048);
41 // To retrieve the reduction result.
43 reduce_result.resize(1);
44 double x = Platform::getRealTime();
45 {
46 //nb_loop = 1;
47 //nb_value = 100000;
48 SmallSpan<const Int64> c_view(c);
49 for (int j = 0; j < nb_loop; ++j) {
50 auto command = makeCommand(queue);
51 CooperativeWorkGroupLoopRange loop_range(nb_value);
52 loop_range.setBlockSize(command);
53 Int32 local_memory_size = (queue.isAcceleratorPolicy()) ? loop_range.blockSize() : 1;
54 LocalMemory<Int64> block_partial_sum(command, local_memory_size);
55 auto partial_sum_span = viewInOut(command, by_block_partial_sum);
56 auto out_reduce_result = viewOut(command, reduce_result);
57 command << RUNCOMMAND_LAUNCH(iter, loop_range, block_partial_sum)
58 {
59 auto grid = iter.grid();
60 auto block = iter.block();
61 auto w = iter.workItem();
62 auto block_partial_sum_span = block_partial_sum.span();
63
64 // Each WorkItem calculates the reduction for the indices it processes
65 // and stores the result in local memory.
66 Int64 my_v = 0;
67 for (Int32 i : w.linearIndexes())
68 my_v += c_view[i];
69 block_partial_sum_span[w.rankInBlock()] = my_v;
70 // Wait until all WorkItems in the block are finished
71 block.barrier();
72 // The first WorkItem in the block performs the reduction
73 // on the values of the local array.
74 if (w.rankInBlock() == 0) {
75 Int32 nb_local = block_partial_sum_span.size();
76 Int64 block_v = 0;
77 for (Int32 i = 0; i < nb_local; ++i)
78 block_v += block_partial_sum_span[i];
79 partial_sum_span[block.groupRank()] = block_v;
80 }
81 // Wait until the entire grid is finished.
82 grid.barrier();
83 // The first WorkItem performs the final reduction
84 if (w.rankInBlock() == 0 && block.groupRank() == 0) {
85 Int64 final_sum = 0;
86 Int32 nb_block = grid.nbBlock();
87 for (Int32 i = 0; i < nb_block; ++i) {
88 final_sum += partial_sum_span[i];
89 }
90 out_reduce_result[0] = final_sum;
91 }
92 };
93 total_x += reduce_result[0];
94 }
95 }
96 double y = Platform::getRealTime();
97 Int64 nb_byte = c.size() * sizeof(Int64) * nb_loop;
98 Real diff = y - x;
99 Real nb_giga_byte_second = (static_cast<Real>(nb_byte) / 1.0e9) / diff;
100 std::cout << "** TotalCooperativeLaunch2=" << total_x
101 << " nb_value=" << nb_value
102 << " GB/s=" << nb_giga_byte_second << " time=" << diff << "\n";
103 return total_x;
104}
105
106/*---------------------------------------------------------------------------*/
107/*---------------------------------------------------------------------------*/
Types and functions for managing synchronizations on accelerators.
Types and macros for managing hierarchical parallelism on accelerators.
#define RUNCOMMAND_LAUNCH(iter_name, bounds,...)
Macro to launch a command using hierarchical, possibly cooperative, parallelism.
Iteration range of a loop using cooperative hierarchical parallelism.
eMemoryResource memoryResource() const
Memory resource used for allocations with this instance.
Definition RunQueue.cc:372
bool isAcceleratorPolicy() const
Indicates if the instance is associated with an accelerator.
Definition RunQueue.cc:331
Multi-dimensional arrays for numerical types accessible on accelerators.
View of an array of elements of type T.
Definition Span.h:805
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
Definition Span.h:327
RunCommand makeCommand(const RunQueue &run_queue)
Creates a command associated with the queue run_queue.
auto viewInOut(const ViewBuildInfo &vbi, CellMaterialVariableScalarRef< DataType > &var)
Read/write view for scalar material variables.
auto viewOut(const ViewBuildInfo &vbi, CellMaterialVariableScalarRef< DataType > &var)
Write view for scalar material variables.
Real getRealTime()
Real time used in seconds.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
@ HostPinned
Allocates on the host.