pencil_box/array/
difference.rs

1use std::collections::HashSet;
2use std::hash::Hash;
3use ahash::AHashSet;
4
5/// Computes the difference between a primary list and multiple exclusion lists using [`std::collections::HashSet`].
6///
7/// # Type Parameters
8/// - `T`: The element type. Must implement [`Clone`], [`Eq`], and [`Hash`].
9///
10/// # Arguments
11/// - `to_compare`: A vector of values to retain if not found in `others`.
12/// - `others`: A reference to a list of reference vectors containing values to exclude.
13///
14/// # Returns
15/// A new `Vec<T>` containing only the values from `to_compare` that are not found in any of the `others`.
16///
17/// # Behavior
18/// - Returns all items from `to_compare` that are not present in any exclusion list in `others`.
19/// - Performs equality comparison using `==`, backed by `Eq` + `Hash`.
20///
21/// # Performance
22/// - Uses [`HashSet`] (SipHash): **secure and collision-resistant**, suitable for untrusted input.
23/// - Preallocates capacity for efficiency and avoids unnecessary allocations.
24/// - Performs at most one `clone()` per included or excluded item.
25/// - For large datasets where security is not a concern, see [`difference_performant`].
26///
27/// # Examples
28///
29/// ### ✂️ Filter out excluded values
30/// ```
31/// use pencil_box::array::difference::difference;
32///
33/// let a = vec![1, 2, 3, 4, 5];
34/// let b = vec![2, 4];
35/// let c = vec![5];
36///
37/// let result = difference(&a, &vec![&b, &c]);
38/// assert_eq!(result, vec![1, 3]);
39/// ```
40///
41/// ### 🔤 Works with strings
42/// ```
43/// let a = vec!["apple", "banana", "cherry"];
44/// let b = vec!["banana"];
45/// let result = difference(&a, &vec![&b]);
46/// assert_eq!(result, vec!["apple", "cherry"]);
47/// ```
48///
49/// ### 📭 Handles empty inputs
50/// ```
51/// let a: Vec<i32> = vec![];
52/// let b = vec![1, 2, 3];
53/// let result = difference(&a, &vec![&b]);
54/// assert!(result.is_empty());
55/// ```
56
57pub fn difference<T: Eq + Hash + Clone>(
58    to_compare: &Vec<T>,
59    others: &Vec<&Vec<T>>,
60) -> Vec<T> {
61    let capacity = others.iter().map(|sub_array| sub_array.len()).sum();
62    let mut set: HashSet<T> = HashSet::with_capacity(capacity);
63
64    for arr in others {
65        for item in *arr {
66            set.insert(item.clone()); // clone item into the set
67        }
68    }
69
70    to_compare
71        .iter()
72        .filter(|item| !set.contains(item))
73        .cloned()
74        .collect()
75}
76
77/// Computes the difference between a primary list and multiple exclusion lists using [`AHashSet`] for maximum performance.
78///
79/// # Type Parameters
80/// - `T`: The element type. Must implement [`Clone`], [`Eq`], and [`Hash`].
81///
82/// # Arguments
83/// - `to_compare`: A vector of values to retain if not found in `others`.
84/// - `others`: A reference to a list of reference vectors containing values to exclude.
85///
86/// # Returns
87/// A new `Vec<T>` containing only the values from `to_compare` that are not found in any of the `others`.
88///
89/// # Behavior
90/// - Identical in output to [`difference`], but optimized using [`ahash::AHashSet`] for faster performance.
91/// - Equality comparison based on `==` (requires `Eq` + `Hash`).
92///
93/// # Performance
94/// - ⚡ Uses [`AHashSet`], a fast, non-cryptographic hashing algorithm (Blake3-inspired).
95/// - 🚀 Significantly faster than `HashSet` for large data, but **not DoS-resistant** (not safe for untrusted input).
96/// - Preallocates exclusion set and result vector for efficiency.
97/// - Performs at most one `clone()` per unique value processed.
98///
99/// # Examples
100///
101/// ### 🚀 Fast difference on large numbers
102/// ```
103/// use pencil_box::array::difference::difference_performant;
104///
105/// let a: Vec<_> = (0..100_000).collect();
106/// let b: Vec<_> = (50_000..150_000).collect();
107/// let result = difference_performant(&a, &vec![&b]);
108/// assert_eq!(result.len(), 50_000);
109/// ```
110///
111/// ### ⚠️ Not suitable for hostile input
112/// ```text
113/// AHashSet is not cryptographically secure. Use `difference` with HashSet if you're handling untrusted or externally-supplied keys.
114/// ```
115///
116/// ### ✅ Identical logic to `difference`
117/// ```
118/// let a = vec![1, 2, 3, 4];
119/// let b = vec![2, 4];
120/// let result = difference_performant(&a, &vec![&b]);
121/// assert_eq!(result, vec![1, 3]);
122/// ```
123
124pub fn difference_performant<T: Eq + Hash + Clone>(
125    to_compare: &Vec<T>,
126    others: &Vec<&Vec<T>>,
127) -> Vec<T> {
128    let capacity = others.iter().map(|sub_array| sub_array.len()).sum();
129    let mut set: AHashSet<T> = AHashSet::with_capacity(capacity);
130
131    for arr in others {
132        for item in *arr {
133            set.insert(item.clone()); // clone item into the set
134        }
135    }
136
137    to_compare
138        .iter()
139        .filter(|item| !set.contains(item))
140        .cloned()
141        .collect()
142}