82 command << trace_info;
85 switch (exec_policy) {
86#if defined(ARCANE_COMPILING_CUDA)
88 size_t temp_storage_size = 0;
89 cudaStream_t stream = impl::CudaUtils::toNativeStream(&queue);
91 int* nb_list1_ptr =
nullptr;
92 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(
nullptr, temp_storage_size,
93 input_iter, output_iter, nb_list1_ptr, nb_item,
94 select_lambda, stream));
96 s.m_algo_storage.allocate(temp_storage_size);
97 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
98 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(s.m_algo_storage.address(), temp_storage_size,
99 input_iter, output_iter, nb_list1_ptr, nb_item,
100 select_lambda, stream));
101 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
104#if defined(ARCANE_COMPILING_HIP)
106 size_t temp_storage_size = 0;
108 hipStream_t stream = impl::HipUtils::toNativeStream(&queue);
109 int* nb_list1_ptr =
nullptr;
110 ARCANE_CHECK_HIP(rocprim::partition(
nullptr, temp_storage_size, input_iter, output_iter,
111 nb_list1_ptr, nb_item, select_lambda, stream));
113 s.m_algo_storage.allocate(temp_storage_size);
114 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
116 ARCANE_CHECK_HIP(rocprim::partition(s.m_algo_storage.address(), temp_storage_size, input_iter, output_iter,
117 nb_list1_ptr, nb_item, select_lambda, stream));
118 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
126 auto saved_output_iter = output_iter;
127 auto output2_iter = output_iter + nb_item;
128 for (
Int32 i = 0; i < nb_item; ++i) {
129 auto v = *input_iter;
130 if (select_lambda(v)) {
140 Int32 nb_list1 =
static_cast<Int32>(output_iter - saved_output_iter);
141 s.m_host_nb_list1_storage[0] = nb_list1;
156 InputIterator input_iter,
157 FirstOutputIterator first_output_iter,
158 SecondOutputIterator second_output_iter,
159 UnselectedIterator unselected_iter,
160 const Select1Lambda& select1_lambda,
161 const Select2Lambda& select2_lambda,
167 command << trace_info;
170 switch (exec_policy) {
171#if defined(ARCANE_COMPILING_CUDA)
173 size_t temp_storage_size = 0;
174 cudaStream_t stream = impl::CudaUtils::toNativeStream(&queue);
176 int* nb_list1_ptr =
nullptr;
177 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(
nullptr, temp_storage_size,
178 input_iter, first_output_iter, second_output_iter,
179 unselected_iter, nb_list1_ptr, nb_item,
180 select1_lambda, select2_lambda, stream));
182 s.m_algo_storage.allocate(temp_storage_size);
183 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
184 ARCANE_CHECK_CUDA(::cub::DevicePartition::If(s.m_algo_storage.address(), temp_storage_size,
185 input_iter, first_output_iter, second_output_iter,
186 unselected_iter, nb_list1_ptr, nb_item,
187 select1_lambda, select2_lambda, stream));
188 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
191#if defined(ARCANE_COMPILING_HIP)
193 size_t temp_storage_size = 0;
195 hipStream_t stream = impl::HipUtils::toNativeStream(&queue);
196 int* nb_list1_ptr =
nullptr;
197 using namespace rocprim;
198 ARCANE_CHECK_HIP(::rocprim::partition_three_way(
nullptr, temp_storage_size, input_iter, first_output_iter,
199 second_output_iter, unselected_iter,
200 nb_list1_ptr, nb_item, select1_lambda, select2_lambda, stream));
202 s.m_algo_storage.allocate(temp_storage_size);
203 nb_list1_ptr = s.m_device_nb_list1_storage.allocate();
205 ARCANE_CHECK_HIP(partition_three_way(s.m_algo_storage.address(), temp_storage_size, input_iter, first_output_iter,
206 second_output_iter, unselected_iter, nb_list1_ptr, nb_item,
207 select1_lambda, select2_lambda, stream));
208 s.m_device_nb_list1_storage.
copyToAsync(s.m_host_nb_list1_storage, queue);
218 for (
Int32 i = 0; i < nb_item; ++i) {
219 auto v = *input_iter;
220 bool is_1 = select1_lambda(v);
221 bool is_2 = select2_lambda(v);
223 *first_output_iter = v;
229 *second_output_iter = v;
230 ++second_output_iter;
234 *unselected_iter = v;
241 s.m_host_nb_list1_storage[0] = nb_first;
242 s.m_host_nb_list1_storage[1] = nb_second;