86 command << trace_info;
89 switch (exec_policy) {
90#if defined(ARCANE_COMPILING_CUDA)
92 size_t temp_storage_size = 0;
93 cudaStream_t stream = impl::CudaUtils::toNativeStream(&queue);
95 int* nb_list1_ptr =
nullptr;
96 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(
nullptr, temp_storage_size,
97 input_iter, output_iter, nb_list1_ptr, nb_item,
98 select_lambda, stream));
100 s.m_algo_storage.allocate(temp_storage_size);
101 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
102 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(s.m_algo_storage.address(), temp_storage_size,
103 input_iter, output_iter, nb_list1_ptr, nb_item,
104 select_lambda, stream));
105 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
108#if defined(ARCANE_COMPILING_HIP)
110 size_t temp_storage_size = 0;
112 hipStream_t stream = impl::HipUtils::toNativeStream(&queue);
113 int* nb_list1_ptr =
nullptr;
114 ARCANE_CHECK_HIP(rocprim::partition(
nullptr, temp_storage_size, input_iter, output_iter,
115 nb_list1_ptr, nb_item, select_lambda, stream));
117 s.m_algo_storage.allocate(temp_storage_size);
118 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
120 ARCANE_CHECK_HIP(rocprim::partition(s.m_algo_storage.address(), temp_storage_size, input_iter, output_iter,
121 nb_list1_ptr, nb_item, select_lambda, stream));
122 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
125#if defined(ARCANE_COMPILING_SYCL) && defined(__INTEL_LLVM_COMPILER)
137 using InputDataType =
typename InputIterator::value_type;
138 using DataType =
typename OutputIterator::value_type;
148 tmp_output[i] = input_iter[i];
151 auto tmp_select_lambda = [=](
Int32 i) {
return tmp_select[i]; };
152 sycl::queue sycl_queue = impl::SyclUtils::toNativeStream(queue);
153 auto policy = oneapi::dpl::execution::make_device_policy(sycl_queue);
154 auto output_after = oneapi::dpl::stable_partition(policy, tmp_output.begin(), tmp_output.end(), select_lambda);
156 Int32 nb_list1 = (output_after - tmp_output.begin());
157 Int32 nb_list2 = nb_item - nb_list1;
158 s.m_host_nb_list1_storage[0] = nb_list1;
167 Int32 nb_iter2 = (nb_list2 / 2) + (nb_list2 % 2);
174 Int32 j = i - nb_list1;
175 Int32 reverse_i = (nb_item - (j + 1));
176 auto x1 = tmp_output[i];
177 auto x2 = tmp_output[reverse_i];
178 output_iter[i] = tmp_output[reverse_i];
179 output_iter[reverse_i] = tmp_output[i];
182 output_iter[i] = tmp_output[i];
192 auto saved_output_iter = output_iter;
193 auto output2_iter = output_iter + nb_item;
194 for (
Int32 i = 0; i < nb_item; ++i) {
195 auto v = *input_iter;
196 if (select_lambda(v)) {
206 Int32 nb_list1 =
static_cast<Int32>(output_iter - saved_output_iter);
207 s.m_host_nb_list1_storage[0] = nb_list1;
222 InputIterator input_iter,
223 FirstOutputIterator first_output_iter,
224 SecondOutputIterator second_output_iter,
225 UnselectedIterator unselected_iter,
226 const Select1Lambda& select1_lambda,
227 const Select2Lambda& select2_lambda,
233 command << trace_info;
236 switch (exec_policy) {
237#if defined(ARCANE_COMPILING_CUDA)
239 size_t temp_storage_size = 0;
240 cudaStream_t stream = impl::CudaUtils::toNativeStream(&queue);
242 int* nb_list1_ptr =
nullptr;
243 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(
nullptr, temp_storage_size,
244 input_iter, first_output_iter, second_output_iter,
245 unselected_iter, nb_list1_ptr, nb_item,
246 select1_lambda, select2_lambda, stream));
248 s.m_algo_storage.allocate(temp_storage_size);
249 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
250 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(s.m_algo_storage.address(), temp_storage_size,
251 input_iter, first_output_iter, second_output_iter,
252 unselected_iter, nb_list1_ptr, nb_item,
253 select1_lambda, select2_lambda, stream));
254 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
257#if defined(ARCANE_COMPILING_HIP)
259 size_t temp_storage_size = 0;
261 hipStream_t stream = impl::HipUtils::toNativeStream(&queue);
262 int* nb_list1_ptr =
nullptr;
263 using namespace rocprim;
264 ARCANE_CHECK_HIP(::rocprim::partition_three_way(
nullptr, temp_storage_size, input_iter, first_output_iter,
265 second_output_iter, unselected_iter,
266 nb_list1_ptr, nb_item, select1_lambda, select2_lambda, stream));
268 s.m_algo_storage.allocate(temp_storage_size);
269 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
271 ARCANE_CHECK_HIP(partition_three_way(s.m_algo_storage.address(), temp_storage_size, input_iter, first_output_iter,
272 second_output_iter, unselected_iter, nb_list1_ptr, nb_item,
273 select1_lambda, select2_lambda, stream));
274 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
283 for (
Int32 i = 0; i < nb_item; ++i) {
284 auto v = *input_iter;
285 bool is_1 = select1_lambda(v);
286 bool is_2 = select2_lambda(v);
288 *first_output_iter = v;
294 *second_output_iter = v;
295 ++second_output_iter;
299 *unselected_iter = v;
306 s.m_host_nb_list1_storage[0] = nb_first;
307 s.m_host_nb_list1_storage[1] = nb_second;