84 RunQueue queue = s.m_queue;
85 eExecutionPolicy exec_policy = queue.executionPolicy();
86 RunCommand command = makeCommand(queue);
87 command << trace_info;
88 Impl::RunCommandLaunchInfo launch_info(command, nb_item);
89 launch_info.beginExecute();
90 switch (exec_policy) {
91#if defined(ARCANE_COMPILING_CUDA)
92 case eExecutionPolicy::CUDA: {
93 size_t temp_storage_size = 0;
94 cudaStream_t stream = Impl::CudaUtils::toNativeStream(&queue);
96 int* nb_list1_ptr =
nullptr;
97 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(
nullptr, temp_storage_size,
98 input_iter, output_iter, nb_list1_ptr, nb_item,
99 select_lambda, stream));
101 s.m_algo_storage.allocate(temp_storage_size);
102 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
103 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(s.m_algo_storage.address(), temp_storage_size,
104 input_iter, output_iter, nb_list1_ptr, nb_item,
105 select_lambda, stream));
106 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
109#if defined(ARCANE_COMPILING_HIP)
110 case eExecutionPolicy::HIP: {
111 size_t temp_storage_size = 0;
113 hipStream_t stream = Impl::HipUtils::toNativeStream(&queue);
114 int* nb_list1_ptr =
nullptr;
115 ARCANE_CHECK_HIP(rocprim::partition(
nullptr, temp_storage_size, input_iter, output_iter,
116 nb_list1_ptr, nb_item, select_lambda, stream));
118 s.m_algo_storage.allocate(temp_storage_size);
119 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
121 ARCANE_CHECK_HIP(rocprim::partition(s.m_algo_storage.address(), temp_storage_size, input_iter, output_iter,
122 nb_list1_ptr, nb_item, select_lambda, stream));
123 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
126#if defined(ARCANE_COMPILING_SYCL)
127 case eExecutionPolicy::SYCL: {
128#if defined(ARCANE_HAS_ONEDPL)
139 using InputDataType =
typename InputIterator::value_type;
140 using DataType =
typename OutputIterator::value_type;
143 auto tmp_output = tmp_output_numarray.to1DSmallSpan();
144 auto tmp_select = tmp_select_numarray.to1DSmallSpan();
146 auto command = makeCommand(queue);
150 tmp_output[i] = input_iter[i];
153 auto tmp_select_lambda = [=](
Int32 i) {
return tmp_select[i]; };
154 sycl::queue sycl_queue = Impl::SyclUtils::toNativeStream(queue);
155 auto policy = oneapi::dpl::execution::make_device_policy(sycl_queue);
156 auto output_after = oneapi::dpl::stable_partition(policy, tmp_output.begin(), tmp_output.end(), select_lambda);
158 Int32 nb_list1 = (output_after - tmp_output.begin());
159 Int32 nb_list2 = nb_item - nb_list1;
160 s.m_host_nb_list1_storage[0] = nb_list1;
168 auto command = makeCommand(queue);
169 Int32 nb_iter2 = (nb_list2 / 2) + (nb_list2 % 2);
176 Int32 j = i - nb_list1;
177 Int32 reverse_i = (nb_item - (j + 1));
178 auto x1 = tmp_output[i];
179 auto x2 = tmp_output[reverse_i];
180 output_iter[i] = tmp_output[reverse_i];
181 output_iter[reverse_i] = tmp_output[i];
184 output_iter[i] = tmp_output[i];
193 case eExecutionPolicy::Thread:
196 case eExecutionPolicy::Sequential: {
197 auto saved_output_iter = output_iter;
198 auto output2_iter = output_iter + nb_item;
199 for (
Int32 i = 0; i < nb_item; ++i) {
200 auto v = *input_iter;
201 if (select_lambda(v)) {
211 Int32 nb_list1 =
static_cast<Int32>(output_iter - saved_output_iter);
212 s.m_host_nb_list1_storage[0] = nb_list1;
217 launch_info.endExecute();
227 InputIterator input_iter,
228 FirstOutputIterator first_output_iter,
229 SecondOutputIterator second_output_iter,
230 UnselectedIterator unselected_iter,
231 const Select1Lambda& select1_lambda,
232 const Select2Lambda& select2_lambda,
235 RunQueue queue = s.m_queue;
236 eExecutionPolicy exec_policy = queue.executionPolicy();
237 RunCommand command = makeCommand(queue);
238 command << trace_info;
239 Impl::RunCommandLaunchInfo launch_info(command, nb_item);
240 launch_info.beginExecute();
241 switch (exec_policy) {
242#if defined(ARCANE_COMPILING_CUDA)
243 case eExecutionPolicy::CUDA: {
244 size_t temp_storage_size = 0;
245 cudaStream_t stream = Impl::CudaUtils::toNativeStream(&queue);
247 int* nb_list1_ptr =
nullptr;
248 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(
nullptr, temp_storage_size,
249 input_iter, first_output_iter, second_output_iter,
250 unselected_iter, nb_list1_ptr, nb_item,
251 select1_lambda, select2_lambda, stream));
253 s.m_algo_storage.allocate(temp_storage_size);
254 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
255 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(s.m_algo_storage.address(), temp_storage_size,
256 input_iter, first_output_iter, second_output_iter,
257 unselected_iter, nb_list1_ptr, nb_item,
258 select1_lambda, select2_lambda, stream));
259 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
262#if defined(ARCANE_COMPILING_HIP)
263 case eExecutionPolicy::HIP: {
264 size_t temp_storage_size = 0;
266 hipStream_t stream = Impl::HipUtils::toNativeStream(&queue);
267 int* nb_list1_ptr =
nullptr;
268 using namespace rocprim;
269 ARCANE_CHECK_HIP(::rocprim::partition_three_way(
nullptr, temp_storage_size, input_iter, first_output_iter,
270 second_output_iter, unselected_iter,
271 nb_list1_ptr, nb_item, select1_lambda, select2_lambda, stream));
273 s.m_algo_storage.allocate(temp_storage_size);
274 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
276 ARCANE_CHECK_HIP(partition_three_way(s.m_algo_storage.address(), temp_storage_size, input_iter, first_output_iter,
277 second_output_iter, unselected_iter, nb_list1_ptr, nb_item,
278 select1_lambda, select2_lambda, stream));
279 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
282#if defined(ARCANE_COMPILING_SYCL)
283 case eExecutionPolicy::SYCL: {
288 case eExecutionPolicy::Thread:
291 case eExecutionPolicy::Sequential: {
294 for (
Int32 i = 0; i < nb_item; ++i) {
295 auto v = *input_iter;
296 bool is_1 = select1_lambda(v);
297 bool is_2 = select2_lambda(v);
299 *first_output_iter = v;
305 *second_output_iter = v;
306 ++second_output_iter;
310 *unselected_iter = v;
317 s.m_host_nb_list1_storage[0] = nb_first;
318 s.m_host_nb_list1_storage[1] = nb_second;
323 launch_info.endExecute();