85 command << trace_info;
88 switch (exec_policy) {
89#if defined(ARCANE_COMPILING_CUDA)
91 size_t temp_storage_size = 0;
92 cudaStream_t stream = impl::CudaUtils::toNativeStream(&queue);
94 int* nb_list1_ptr =
nullptr;
95 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(
nullptr, temp_storage_size,
96 input_iter, output_iter, nb_list1_ptr, nb_item,
97 select_lambda, stream));
99 s.m_algo_storage.allocate(temp_storage_size);
100 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
101 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(s.m_algo_storage.address(), temp_storage_size,
102 input_iter, output_iter, nb_list1_ptr, nb_item,
103 select_lambda, stream));
104 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
107#if defined(ARCANE_COMPILING_HIP)
109 size_t temp_storage_size = 0;
111 hipStream_t stream = impl::HipUtils::toNativeStream(&queue);
112 int* nb_list1_ptr =
nullptr;
113 ARCANE_CHECK_HIP(rocprim::partition(
nullptr, temp_storage_size, input_iter, output_iter,
114 nb_list1_ptr, nb_item, select_lambda, stream));
116 s.m_algo_storage.allocate(temp_storage_size);
117 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
119 ARCANE_CHECK_HIP(rocprim::partition(s.m_algo_storage.address(), temp_storage_size, input_iter, output_iter,
120 nb_list1_ptr, nb_item, select_lambda, stream));
121 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
124#if defined(ARCANE_COMPILING_SYCL) && defined(__INTEL_LLVM_COMPILER)
136 using InputDataType =
typename InputIterator::value_type;
137 using DataType =
typename OutputIterator::value_type;
147 tmp_output[i] = input_iter[i];
150 auto tmp_select_lambda = [=](
Int32 i) {
return tmp_select[i]; };
151 sycl::queue sycl_queue = impl::SyclUtils::toNativeStream(queue);
152 auto policy = oneapi::dpl::execution::make_device_policy(sycl_queue);
153 auto output_after = oneapi::dpl::stable_partition(policy, tmp_output.begin(), tmp_output.end(), select_lambda);
155 Int32 nb_list1 = (output_after - tmp_output.begin());
156 Int32 nb_list2 = nb_item - nb_list1;
157 s.m_host_nb_list1_storage[0] = nb_list1;
166 Int32 nb_iter2 = (nb_list2 / 2) + (nb_list2 % 2);
173 Int32 j = i - nb_list1;
174 Int32 reverse_i = (nb_item - (j + 1));
175 auto x1 = tmp_output[i];
176 auto x2 = tmp_output[reverse_i];
177 output_iter[i] = tmp_output[reverse_i];
178 output_iter[reverse_i] = tmp_output[i];
181 output_iter[i] = tmp_output[i];
191 auto saved_output_iter = output_iter;
192 auto output2_iter = output_iter + nb_item;
193 for (
Int32 i = 0; i < nb_item; ++i) {
194 auto v = *input_iter;
195 if (select_lambda(v)) {
205 Int32 nb_list1 =
static_cast<Int32>(output_iter - saved_output_iter);
206 s.m_host_nb_list1_storage[0] = nb_list1;
221 InputIterator input_iter,
222 FirstOutputIterator first_output_iter,
223 SecondOutputIterator second_output_iter,
224 UnselectedIterator unselected_iter,
225 const Select1Lambda& select1_lambda,
226 const Select2Lambda& select2_lambda,
232 command << trace_info;
235 switch (exec_policy) {
236#if defined(ARCANE_COMPILING_CUDA)
238 size_t temp_storage_size = 0;
239 cudaStream_t stream = impl::CudaUtils::toNativeStream(&queue);
241 int* nb_list1_ptr =
nullptr;
242 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(
nullptr, temp_storage_size,
243 input_iter, first_output_iter, second_output_iter,
244 unselected_iter, nb_list1_ptr, nb_item,
245 select1_lambda, select2_lambda, stream));
247 s.m_algo_storage.allocate(temp_storage_size);
248 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
249 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(s.m_algo_storage.address(), temp_storage_size,
250 input_iter, first_output_iter, second_output_iter,
251 unselected_iter, nb_list1_ptr, nb_item,
252 select1_lambda, select2_lambda, stream));
253 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
256#if defined(ARCANE_COMPILING_HIP)
258 size_t temp_storage_size = 0;
260 hipStream_t stream = impl::HipUtils::toNativeStream(&queue);
261 int* nb_list1_ptr =
nullptr;
262 using namespace rocprim;
263 ARCANE_CHECK_HIP(::rocprim::partition_three_way(
nullptr, temp_storage_size, input_iter, first_output_iter,
264 second_output_iter, unselected_iter,
265 nb_list1_ptr, nb_item, select1_lambda, select2_lambda, stream));
267 s.m_algo_storage.allocate(temp_storage_size);
268 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
270 ARCANE_CHECK_HIP(partition_three_way(s.m_algo_storage.address(), temp_storage_size, input_iter, first_output_iter,
271 second_output_iter, unselected_iter, nb_list1_ptr, nb_item,
272 select1_lambda, select2_lambda, stream));
273 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
282 for (
Int32 i = 0; i < nb_item; ++i) {
283 auto v = *input_iter;
284 bool is_1 = select1_lambda(v);
285 bool is_2 = select2_lambda(v);
287 *first_output_iter = v;
293 *second_output_iter = v;
294 ++second_output_iter;
298 *unselected_iter = v;
305 s.m_host_nb_list1_storage[0] = nb_first;
306 s.m_host_nb_list1_storage[1] = nb_second;