1use crate::array::Array;
2use crate::error::{NumRs2Error, Result};
3use num_traits::Zero;
4use scirs2_core::parallel_ops::*;
5use std::collections::HashMap;
6use std::collections::HashSet;
7use std::fmt::Debug;
8use std::hash::Hash;
9use std::sync::atomic::{AtomicUsize, Ordering};
10
11pub fn unique_optimized<T>(
35 a: &Array<T>,
36 axis: Option<usize>,
37 return_index: Option<bool>,
38 return_inverse: Option<bool>,
39 return_counts: Option<bool>,
40) -> Result<UniqueResult<T>>
41where
42 T: Clone + Hash + Eq + Debug + Zero + Send + Sync,
43{
44 fn estimate_capacity(array_size: usize) -> usize {
46 if array_size < 1000 {
49 array_size
50 } else {
51 (array_size as f64 * 0.9) as usize
52 }
53 }
54
55 if axis.is_none() {
57 let flat_data = a.to_vec();
58 let array_size = flat_data.len();
59
60 if array_size > 10000 {
62 return unique_optimized_large(
63 &flat_data,
64 array_size,
65 return_index,
66 return_inverse,
67 return_counts,
68 );
69 }
70
71 let estimated_capacity = estimate_capacity(array_size);
73
74 let mut unique_elements = Vec::with_capacity(estimated_capacity);
76 let mut first_indices = if return_index.unwrap_or(false) {
77 Vec::with_capacity(estimated_capacity)
78 } else {
79 Vec::new()
80 };
81 let mut inverse_indices = if return_inverse.unwrap_or(false) {
82 vec![0; array_size]
83 } else {
84 Vec::new()
85 };
86 let need_inverse = return_inverse.unwrap_or(false);
87
88 let mut value_to_index = HashMap::with_capacity(estimated_capacity);
90
91 for (i, value) in flat_data.iter().enumerate() {
93 if let Some(&idx) = value_to_index.get(value) {
94 if need_inverse {
96 inverse_indices[i] = idx;
97 }
98 } else {
99 let new_idx = unique_elements.len();
101 unique_elements.push(value.clone());
102 if return_index.unwrap_or(false) {
103 first_indices.push(i);
104 }
105 value_to_index.insert(value, new_idx);
106 if need_inverse {
107 inverse_indices[i] = new_idx;
108 }
109 }
110 }
111
112 let counts = if return_counts.unwrap_or(false) {
114 let mut counts_vec = vec![0; unique_elements.len()];
115 if need_inverse {
116 for &idx in &inverse_indices {
118 counts_vec[idx] += 1;
119 }
120 } else {
121 for value in flat_data.iter() {
123 let idx = *value_to_index
124 .get(value)
125 .expect("value must exist in value_to_index map");
126 counts_vec[idx] += 1;
127 }
128 }
129 Some(Array::from_vec(counts_vec))
130 } else {
131 None
132 };
133
134 let unique_array = Array::from_vec(unique_elements);
136
137 return Ok(UniqueResult {
138 values: unique_array,
139 indices: if return_index.unwrap_or(false) {
140 Some(Array::from_vec(first_indices))
141 } else {
142 None
143 },
144 inverse: if return_inverse.unwrap_or(false) {
145 Some(Array::from_vec(inverse_indices))
146 } else {
147 None
148 },
149 counts,
150 });
151 }
152
153 let axis_val = axis.expect("axis must be Some at this point (None case handled above)");
155 if axis_val >= a.ndim() {
156 return Err(NumRs2Error::DimensionMismatch(format!(
157 "Axis {} out of bounds for array of dimension {}",
158 axis_val,
159 a.ndim()
160 )));
161 }
162
163 let shape = a.shape();
165
166 if shape.len() == 1 && axis_val == 0 {
168 return unique_optimized(a, None, return_index, return_inverse, return_counts);
169 }
170
171 let axis_len = shape[axis_val];
175
176 let mut subarrays = Vec::with_capacity(axis_len);
178 let mut subarray_hashes = Vec::with_capacity(axis_len);
179
180 for i in 0..axis_len {
182 let subarray = a.slice(axis_val, i)?;
184
185 let hash_rep = subarray.to_vec();
187
188 subarrays.push(subarray);
189 subarray_hashes.push(hash_rep);
190 }
191
192 let estimated_capacity = estimate_capacity(axis_len);
194
195 let mut unique_indices = Vec::with_capacity(estimated_capacity);
197 let mut index_map = HashMap::with_capacity(estimated_capacity);
198 let mut inverse = if return_inverse.unwrap_or(false) {
199 vec![0; axis_len]
200 } else {
201 Vec::new()
202 };
203 let need_inverse = return_inverse.unwrap_or(false);
204 let mut seen = HashSet::with_capacity(estimated_capacity);
205
206 for i in 0..axis_len {
207 let hash_rep = &subarray_hashes[i];
208
209 if !seen.contains(hash_rep) {
210 let idx = unique_indices.len();
212 unique_indices.push(i);
213 index_map.insert(hash_rep, idx);
214 seen.insert(hash_rep.clone());
215 if need_inverse {
216 inverse[i] = idx;
217 }
218 } else {
219 if need_inverse {
221 let idx = *index_map
222 .get(hash_rep)
223 .expect("hash_rep must exist in index_map for seen subarrays");
224 inverse[i] = idx;
225 }
226 }
227 }
228
229 let counts = if return_counts.unwrap_or(false) {
231 let mut counts_vec = vec![0; unique_indices.len()];
232 if need_inverse {
233 for &idx in &inverse {
234 counts_vec[idx] += 1;
235 }
236 } else {
237 for hash_rep in &subarray_hashes {
238 if let Some(&idx) = index_map.get(hash_rep) {
239 counts_vec[idx] += 1;
240 }
241 }
242 }
243 Some(Array::from_vec(counts_vec))
244 } else {
245 None
246 };
247
248 let mut output_shape = shape.clone();
252 output_shape[axis_val] = unique_indices.len();
253
254 let mut unique_subarrays = Vec::with_capacity(unique_indices.len());
256 for &idx in &unique_indices {
257 unique_subarrays.push(&subarrays[idx]);
258 }
259
260 let values = if !unique_subarrays.is_empty() {
262 let mut unique_data = Vec::new();
265 for &idx in &unique_indices {
266 unique_data.extend_from_slice(&subarray_hashes[idx]);
267 }
268 Array::from_vec(unique_data).reshape(&output_shape)
269 } else {
270 Array::zeros(&output_shape)
272 };
273
274 Ok(UniqueResult {
275 values,
276 indices: if return_index.unwrap_or(false) {
277 Some(Array::from_vec(unique_indices))
278 } else {
279 None
280 },
281 inverse: if return_inverse.unwrap_or(false) {
282 Some(Array::from_vec(inverse))
283 } else {
284 None
285 },
286 counts,
287 })
288}
289
290fn unique_optimized_large<T>(
292 flat_data: &[T],
293 array_size: usize,
294 return_index: Option<bool>,
295 return_inverse: Option<bool>,
296 return_counts: Option<bool>,
297) -> Result<UniqueResult<T>>
298where
299 T: Clone + Hash + Eq + Debug + Send + Sync,
300{
301 let need_index = return_index.unwrap_or(false);
302 let need_inverse = return_inverse.unwrap_or(false);
303 let need_counts = return_counts.unwrap_or(false);
304
305 let unique_counter = AtomicUsize::new(0);
307
308 let estimated_capacity = (array_size as f64 * 0.9) as usize;
311 let value_to_index = std::sync::RwLock::new(HashMap::with_capacity(estimated_capacity));
312
313 let mut unique_elements = Vec::with_capacity(estimated_capacity);
315 let mut first_indices = if need_index {
316 Vec::with_capacity(estimated_capacity)
317 } else {
318 Vec::new()
319 };
320
321 let batch_size = std::cmp::max(1, array_size / scirs2_core::parallel_ops::num_threads());
323 let batches = flat_data.chunks(batch_size);
324
325 batches.enumerate().for_each(|(batch_idx, batch)| {
327 let mut local_uniques = HashMap::new();
328 let base_index = batch_idx * batch_size;
329
330 for (local_idx, value) in batch.iter().enumerate() {
332 let global_idx = base_index + local_idx;
333 if !local_uniques.contains_key(value) {
334 local_uniques.insert(value.clone(), global_idx);
335 }
336 }
337
338 let mut value_map = value_to_index
340 .write()
341 .expect("value_to_index RwLock poisoned: failed to acquire write lock");
342 for (value, local_first_idx) in local_uniques {
343 if !value_map.contains_key(&value) {
344 let new_idx = unique_counter.fetch_add(1, Ordering::SeqCst);
345 value_map.insert(value.clone(), new_idx);
346
347 synchronized_push(&mut unique_elements, value);
349 if need_index {
350 synchronized_push(&mut first_indices, local_first_idx);
351 }
352 }
353 }
354 });
355
356 let inverse_indices = if need_inverse {
358 let value_map = value_to_index
359 .read()
360 .expect("value_to_index RwLock poisoned: failed to acquire read lock");
361 flat_data
362 .par_iter()
363 .map(|value| {
364 *value_map
365 .get(value)
366 .expect("value must exist in value_map for inverse indices")
367 })
368 .collect()
369 } else {
370 Vec::new()
371 };
372
373 let counts = if need_counts {
375 let mut counts_vec = vec![0; unique_elements.len()];
376
377 if need_inverse {
378 for &idx in &inverse_indices {
380 counts_vec[idx] += 1;
381 }
382 } else {
383 let value_map = value_to_index
385 .read()
386 .expect("value_to_index RwLock poisoned: failed to acquire read lock");
387
388 let local_counts = flat_data
390 .par_iter()
391 .map(|value| {
392 let idx = *value_map
393 .get(value)
394 .expect("value must exist in value_map for counting");
395 (idx, 1)
396 })
397 .collect::<Vec<(usize, usize)>>();
398
399 for (idx, count) in local_counts {
401 counts_vec[idx] += count;
402 }
403 }
404
405 Some(Array::from_vec(counts_vec))
406 } else {
407 None
408 };
409
410 let unique_array = Array::from_vec(unique_elements);
412
413 Ok(UniqueResult {
414 values: unique_array,
415 indices: if need_index {
416 Some(Array::from_vec(first_indices))
417 } else {
418 None
419 },
420 inverse: if need_inverse {
421 Some(Array::from_vec(inverse_indices))
422 } else {
423 None
424 },
425 counts,
426 })
427}
428
429fn synchronized_push<T: Send + Clone>(vec: &mut Vec<T>, value: T) {
431 let mutex = std::sync::Mutex::new(());
434 let guard = mutex.lock().expect("Mutex poisoned in synchronized_push");
435 vec.push(value);
436 drop(guard);
437}
438
439pub struct UniqueResult<T> {
452 pub values: Array<T>,
453 pub indices: Option<Array<usize>>,
454 pub inverse: Option<Array<usize>>,
455 pub counts: Option<Array<usize>>,
456}
457
458impl<T: Clone> UniqueResult<T> {
459 pub fn values(self) -> Array<T> {
461 self.values
462 }
463
464 pub fn values_indices(self) -> Result<(Array<T>, Array<usize>)> {
466 match self.indices {
467 Some(indices) => Ok((self.values, indices)),
468 None => Err(NumRs2Error::InvalidOperation(
469 "indices were not requested in the unique call".to_string(),
470 )),
471 }
472 }
473
474 pub fn values_inverse(self) -> Result<(Array<T>, Array<usize>)> {
476 match self.inverse {
477 Some(inverse) => Ok((self.values, inverse)),
478 None => Err(NumRs2Error::InvalidOperation(
479 "inverse was not requested in the unique call".to_string(),
480 )),
481 }
482 }
483
484 pub fn values_counts(self) -> Result<(Array<T>, Array<usize>)> {
486 match self.counts {
487 Some(counts) => Ok((self.values, counts)),
488 None => Err(NumRs2Error::InvalidOperation(
489 "counts were not requested in the unique call".to_string(),
490 )),
491 }
492 }
493
494 pub fn values_indices_inverse(self) -> Result<(Array<T>, Array<usize>, Array<usize>)> {
496 match (self.indices, self.inverse) {
497 (Some(indices), Some(inverse)) => Ok((self.values, indices, inverse)),
498 _ => Err(NumRs2Error::InvalidOperation(
499 "either indices or inverse were not requested in the unique call".to_string(),
500 )),
501 }
502 }
503
504 pub fn values_indices_counts(self) -> Result<(Array<T>, Array<usize>, Array<usize>)> {
506 match (self.indices, self.counts) {
507 (Some(indices), Some(counts)) => Ok((self.values, indices, counts)),
508 _ => Err(NumRs2Error::InvalidOperation(
509 "either indices or counts were not requested in the unique call".to_string(),
510 )),
511 }
512 }
513
514 pub fn values_inverse_counts(self) -> Result<(Array<T>, Array<usize>, Array<usize>)> {
516 match (self.inverse, self.counts) {
517 (Some(inverse), Some(counts)) => Ok((self.values, inverse, counts)),
518 _ => Err(NumRs2Error::InvalidOperation(
519 "either inverse or counts were not requested in the unique call".to_string(),
520 )),
521 }
522 }
523
524 pub fn values_indices_inverse_counts(self) -> Result<crate::unique::UniqueTuple<T>> {
526 match (self.indices, self.inverse, self.counts) {
527 (Some(indices), Some(inverse), Some(counts)) => {
528 Ok((self.values, indices, inverse, counts))
529 }
530 _ => Err(NumRs2Error::InvalidOperation(
531 "not all of indices, inverse, and counts were requested in the unique call"
532 .to_string(),
533 )),
534 }
535 }
536}