Skip to main content

matchkit/
match_set.rs

1use crate::Match;
2
3/// Sorted, deduplicated collection of matches with efficient insertion.
4///
5/// Ensures elements are consistently ordered and handles operations
6/// like duplicate removal and overlapping match merges.
7///
8/// # Example
9///
10/// ```rust
11/// use matchkit::{Match, MatchSet};
12///
13/// let mut set = MatchSet::new();
14/// set.insert(Match::from_parts(1, 0, 10));
15/// set.insert(Match::from_parts(1, 5, 15));
16/// set.merge_overlapping();
17///
18/// assert_eq!(set.len(), 1);
19/// assert_eq!(set.as_slice()[0].end, 15);
20/// ```
21#[derive(Debug, Clone, Default)]
22pub struct MatchSet {
23    matches: Vec<Match>,
24}
25
26impl MatchSet {
27    /// Create an empty match set.
28    #[must_use]
29    pub fn new() -> Self {
30        Self {
31            matches: Vec::new(),
32        }
33    }
34
35    /// Create a match set with pre-allocated capacity.
36    pub fn try_with_capacity(cap: usize) -> crate::error::Result<Self> {
37        let mut vec = Vec::new();
38        vec.try_reserve(cap).map_err(|e| {
39            crate::error::Error::OutOfMemory {
40                message: e.to_string(),
41            }
42        })?;
43        Ok(Self { matches: vec })
44    }
45
46    /// Create a match set with pre-allocated capacity (legacy interface, may panic on OOM).
47    #[must_use]
48    pub fn with_capacity(cap: usize) -> Self {
49        // Clamp to the maximum representable capacity for a Vec<Match> to prevent
50        // unbounded allocation requests from untrusted input.
51        let max_cap = (isize::MAX as usize) / std::mem::size_of::<Match>();
52        let cap = cap.min(max_cap);
53        Self {
54            matches: Vec::with_capacity(cap),
55        }
56    }
57
58    /// Insert a match, maintaining sorted order. O(log n) search + O(n) shift.
59    pub fn try_insert(&mut self, m: Match) -> crate::error::Result<()> {
60        if let Err(pos) = self.matches.binary_search(&m) {
61            self.matches.try_reserve(1).map_err(|e| {
62                crate::error::Error::OutOfMemory {
63                    message: e.to_string(),
64                }
65            })?;
66            self.matches.insert(pos, m);
67        }
68        Ok(())
69    }
70
71    /// Insert a match, maintaining sorted order (legacy interface, may panic on OOM).
72    pub fn insert(&mut self, m: Match) {
73        match self.matches.binary_search(&m) {
74            Ok(_) => {} // duplicate — skip
75            Err(pos) => self.matches.insert(pos, m),
76        }
77    }
78
79    /// Extend with multiple matches, then sort and dedup.
80    pub fn try_extend(
81        &mut self,
82        iter: impl IntoIterator<Item = Match>,
83    ) -> crate::error::Result<()> {
84        let iter = iter.into_iter();
85        let (lower, _) = iter.size_hint();
86        if lower > 0 {
87            self.matches.try_reserve(lower).map_err(|e| {
88                crate::error::Error::OutOfMemory {
89                    message: e.to_string(),
90                }
91            })?;
92        }
93        for m in iter {
94            self.matches.push(m);
95        }
96        self.matches.sort_unstable();
97        self.matches.dedup();
98        Ok(())
99    }
100
101    /// Extend with multiple matches, then sort and dedup (legacy interface, may panic on OOM).
102    pub fn extend(&mut self, iter: impl IntoIterator<Item = Match>) {
103        self.matches.extend(iter);
104        self.matches.sort_unstable();
105        self.matches.dedup();
106    }
107
108    /// Merge overlapping matches into a minimal covering set.
109    ///
110    /// After merging, no two matches in the set overlap.
111    /// Pattern ID is taken from the first match in each merged group.
112    pub fn try_merge_overlapping(&mut self) -> crate::error::Result<()> {
113        if self.matches.len() < 2 {
114            return Ok(());
115        }
116        let mut merged = Vec::new();
117        merged.try_reserve(self.matches.len()).map_err(|e| {
118            crate::error::Error::OutOfMemory {
119                message: e.to_string(),
120            }
121        })?;
122        let mut current = self.matches[0];
123
124        for m in &self.matches[1..] {
125            if current.overlaps(m)
126                || (current.pattern_id == m.pattern_id && current.end == m.start)
127            {
128                current.end = current.end.max(m.end);
129            } else {
130                merged.push(current);
131                current = *m;
132            }
133        }
134        merged.push(current);
135        self.matches = merged;
136        Ok(())
137    }
138
139    /// Merge overlapping matches into a minimal covering set (legacy interface, may panic on OOM).
140    pub fn merge_overlapping(&mut self) {
141        if self.matches.len() < 2 {
142            return;
143        }
144        let mut merged: Vec<Match> = Vec::with_capacity(self.matches.len());
145        let mut current = self.matches[0];
146
147        for m in &self.matches[1..] {
148            if current.overlaps(m)
149                || (current.pattern_id == m.pattern_id && current.end == m.start)
150            {
151                // Extend current to cover both (overlapping, or adjacent with same pattern)
152                current.end = current.end.max(m.end);
153            } else {
154                merged.push(current);
155                current = *m;
156            }
157        }
158        merged.push(current);
159        self.matches = merged;
160    }
161
162    /// Number of matches in the set.
163    #[must_use]
164    pub fn len(&self) -> usize {
165        self.matches.len()
166    }
167
168    /// Whether the set is empty.
169    #[must_use]
170    pub fn is_empty(&self) -> bool {
171        self.matches.is_empty()
172    }
173
174    /// Get matches as a slice.
175    #[must_use]
176    pub fn as_slice(&self) -> &[Match] {
177        &self.matches
178    }
179
180    /// Returns an iterator over the matches.
181    pub fn iter(&self) -> std::slice::Iter<'_, Match> {
182        self.matches.iter()
183    }
184
185    /// Consume the set into a Vec.
186    #[must_use]
187    pub fn into_vec(self) -> Vec<Match> {
188        self.matches
189    }
190
191    /// Filter matches to only those with the given pattern ID.
192    #[must_use]
193    pub fn filter_by_pattern(&self, pattern_id: u32) -> Self {
194        Self {
195            matches: self
196                .matches
197                .iter()
198                .copied()
199                .filter(|m| m.pattern_id == pattern_id)
200                .collect(),
201        }
202    }
203
204    /// Filter matches to only those with the given pattern ID, returning an error on OOM.
205    pub fn try_filter_by_pattern(&self, pattern_id: u32) -> crate::error::Result<Self> {
206        let mut matches = Vec::new();
207        matches.try_reserve(self.matches.len()).map_err(|e| {
208            crate::error::Error::OutOfMemory {
209                message: e.to_string(),
210            }
211        })?;
212        for m in &self.matches {
213            if m.pattern_id == pattern_id {
214                matches.push(*m);
215            }
216        }
217        Ok(Self { matches })
218    }
219
220    /// Count matches for each pattern ID.
221    #[must_use]
222    pub fn pattern_counts(&self) -> std::collections::HashMap<u32, usize> {
223        let mut counts = std::collections::HashMap::new();
224        for m in &self.matches {
225            *counts.entry(m.pattern_id).or_insert(0) += 1;
226        }
227        counts
228    }
229
230    /// Count matches for each pattern ID, returning an error on OOM.
231    pub fn try_pattern_counts(&self) -> crate::error::Result<std::collections::HashMap<u32, usize>> {
232        let mut counts = std::collections::HashMap::new();
233        counts.try_reserve(self.matches.len()).map_err(|e| {
234            crate::error::Error::OutOfMemory {
235                message: e.to_string(),
236            }
237        })?;
238        for m in &self.matches {
239            *counts.entry(m.pattern_id).or_insert(0) += 1;
240        }
241        Ok(counts)
242    }
243
244    /// Distinct pattern IDs in the set.
245    #[must_use]
246    pub fn pattern_ids(&self) -> Vec<u32> {
247        let mut ids: Vec<u32> = self.matches.iter().map(|m| m.pattern_id).collect();
248        ids.sort_unstable();
249        ids.dedup();
250        ids
251    }
252
253    /// Distinct pattern IDs in the set, returning an error on OOM.
254    pub fn try_pattern_ids(&self) -> crate::error::Result<Vec<u32>> {
255        let mut ids = Vec::new();
256        ids.try_reserve(self.matches.len()).map_err(|e| {
257            crate::error::Error::OutOfMemory {
258                message: e.to_string(),
259            }
260        })?;
261        for m in &self.matches {
262            ids.push(m.pattern_id);
263        }
264        ids.sort_unstable();
265        ids.dedup();
266        Ok(ids)
267    }
268}
269
270impl IntoIterator for MatchSet {
271    type Item = Match;
272    type IntoIter = std::vec::IntoIter<Match>;
273
274    fn into_iter(self) -> Self::IntoIter {
275        self.matches.into_iter()
276    }
277}
278
279impl<'a> IntoIterator for &'a MatchSet {
280    type Item = &'a Match;
281    type IntoIter = std::slice::Iter<'a, Match>;
282
283    fn into_iter(self) -> Self::IntoIter {
284        self.iter()
285    }
286}