Skip to main content

glob_set/
tinyset.rs

1//! A lightweight glob set using linear scans instead of trie indexing.
2//!
3//! [`TinyGlobSet`] uses the same pattern classification as [`GlobSet`] (extension,
4//! literal, prefix, suffix, Aho-Corasick) but stores prefix and suffix entries in
5//! plain `Vec`s and matches them with linear scans. This makes it **~2.5x faster
6//! to build** than `GlobSet` at the cost of **~20% slower queries** on typical
7//! workloads.
8//!
9//! # When to use `TinyGlobSet`
10//!
11//! - **Few prefix/suffix patterns** (roughly < 20) where the linear scan stays
12//!   fast and the build-time savings dominate.
13//! - **Short-lived or frequently rebuilt** sets, e.g. per-request pattern
14//!   matching where construction cost matters more than query throughput.
15//! - **Minimal memory overhead** is important — no double-array trie allocation.
16//!
17//! # When to use [`GlobSet`] instead
18//!
19//! - **Many prefix/suffix patterns** — `GlobSet` uses a double-array trie with
20//!   O(key-length) lookups regardless of how many patterns share the same
21//!   strategy bucket.
22//! - **Query-heavy workloads** where the set is built once and matched millions
23//!   of times (e.g. file-tree walking).
24//! - **Priority-aware matching** — `GlobSet` supports [`GlobSet::first_match`]
25//!   and powers [`GlobMap`], which `TinyGlobSet` does not.
26//!
27//! [`GlobSet`]: crate::GlobSet
28//! [`GlobSet::first_match`]: crate::GlobSet::first_match
29//! [`GlobMap`]: crate::GlobMap
30
31#![allow(clippy::missing_errors_doc)]
32
33use alloc::string::String;
34use alloc::vec;
35use alloc::vec::Vec;
36
37use aho_corasick::AhoCorasick;
38use hashbrown::HashMap;
39
40use crate::glob::{Candidate, Glob};
41use crate::literal;
42use crate::strategy;
43
44/// A lightweight glob set that trades query speed for fast construction.
45///
46/// See the [module docs](self) for guidance on when to choose this over
47/// [`GlobSet`](crate::GlobSet).
48#[derive(Clone, Debug, Default)]
49pub struct TinyGlobSet {
50    patterns: Vec<Glob>,
51    ext_any: HashMap<String, Vec<usize>>,
52    ext_local: HashMap<String, Vec<usize>>,
53    literals: HashMap<String, usize>,
54    prefixes: Vec<(String, usize)>,
55    suffixes: Vec<(String, usize)>,
56    compound_suffixes: Vec<(String, usize)>,
57    prefix_suffixes: Vec<(String, String, usize)>,
58    ac: Option<AhoCorasick>,
59    ac_to_glob: Vec<usize>,
60    always_check: Vec<usize>,
61}
62
63impl TinyGlobSet {
64    pub fn len(&self) -> usize {
65        self.patterns.len()
66    }
67
68    pub fn is_empty(&self) -> bool {
69        self.patterns.is_empty()
70    }
71
72    pub fn is_match(&self, path: impl AsRef<str>) -> bool {
73        let path = path.as_ref();
74        if self.patterns.is_empty() {
75            return false;
76        }
77
78        if let Some(ext) = strategy::path_extension(path) {
79            if self.ext_any.contains_key(ext) {
80                return true;
81            }
82            if let Some(indices) = self.ext_local.get(ext) {
83                for &idx in indices {
84                    if glob_matcher::glob_match(self.patterns[idx].glob(), path) {
85                        return true;
86                    }
87                }
88            }
89        }
90
91        if self.literals.contains_key(path) {
92            return true;
93        }
94
95        for (prefix, _) in &self.prefixes {
96            if path.starts_with(prefix.as_str()) {
97                return true;
98            }
99        }
100
101        for (suffix, _) in &self.suffixes {
102            if path.ends_with(suffix.as_str()) || path == &suffix[1..] {
103                return true;
104            }
105        }
106
107        for (suffix, _) in &self.compound_suffixes {
108            if path.ends_with(suffix.as_str()) {
109                return true;
110            }
111        }
112
113        for (prefix, suffix, _) in &self.prefix_suffixes {
114            if path.starts_with(prefix.as_str()) && path.ends_with(suffix.as_str()) {
115                return true;
116            }
117        }
118
119        for &idx in &self.always_check {
120            if glob_matcher::glob_match(self.patterns[idx].glob(), path) {
121                return true;
122            }
123        }
124
125        if let Some(ac) = &self.ac {
126            for mat in ac.find_overlapping_iter(path) {
127                let glob_idx = self.ac_to_glob[mat.pattern().as_usize()];
128                if glob_matcher::glob_match(self.patterns[glob_idx].glob(), path) {
129                    return true;
130                }
131            }
132        }
133
134        false
135    }
136
137    pub fn is_match_candidate(&self, candidate: &Candidate<'_>) -> bool {
138        self.is_match(candidate.path())
139    }
140
141    pub fn matches(&self, path: impl AsRef<str>) -> Vec<usize> {
142        let mut result = Vec::new();
143        self.matches_into(path, &mut result);
144        result
145    }
146
147    pub fn matches_into(&self, path: impl AsRef<str>, into: &mut Vec<usize>) {
148        let path = path.as_ref();
149        if self.patterns.is_empty() {
150            return;
151        }
152
153        let mut seen = vec![false; self.patterns.len()];
154
155        if let Some(ext) = strategy::path_extension(path) {
156            if let Some(indices) = self.ext_any.get(ext) {
157                for &idx in indices {
158                    if !seen[idx] {
159                        into.push(idx);
160                        seen[idx] = true;
161                    }
162                }
163            }
164            if let Some(indices) = self.ext_local.get(ext) {
165                for &idx in indices {
166                    if !seen[idx] && glob_matcher::glob_match(self.patterns[idx].glob(), path) {
167                        into.push(idx);
168                        seen[idx] = true;
169                    }
170                }
171            }
172        }
173
174        if let Some(&idx) = self.literals.get(path)
175            && !seen[idx]
176        {
177            into.push(idx);
178            seen[idx] = true;
179        }
180
181        for (prefix, idx) in &self.prefixes {
182            if !seen[*idx] && path.starts_with(prefix.as_str()) {
183                into.push(*idx);
184                seen[*idx] = true;
185            }
186        }
187
188        for (suffix, idx) in &self.suffixes {
189            if !seen[*idx] && (path.ends_with(suffix.as_str()) || path == &suffix[1..]) {
190                into.push(*idx);
191                seen[*idx] = true;
192            }
193        }
194
195        for (suffix, idx) in &self.compound_suffixes {
196            if !seen[*idx] && path.ends_with(suffix.as_str()) {
197                into.push(*idx);
198                seen[*idx] = true;
199            }
200        }
201
202        for (prefix, suffix, idx) in &self.prefix_suffixes {
203            if !seen[*idx] && path.starts_with(prefix.as_str()) && path.ends_with(suffix.as_str()) {
204                into.push(*idx);
205                seen[*idx] = true;
206            }
207        }
208
209        for &idx in &self.always_check {
210            if !seen[idx] && glob_matcher::glob_match(self.patterns[idx].glob(), path) {
211                into.push(idx);
212                seen[idx] = true;
213            }
214        }
215
216        if let Some(ac) = &self.ac {
217            for mat in ac.find_overlapping_iter(path) {
218                let glob_idx = self.ac_to_glob[mat.pattern().as_usize()];
219                if !seen[glob_idx] && glob_matcher::glob_match(self.patterns[glob_idx].glob(), path)
220                {
221                    into.push(glob_idx);
222                    seen[glob_idx] = true;
223                }
224            }
225        }
226    }
227
228    pub fn matches_candidate(&self, candidate: &Candidate<'_>) -> Vec<usize> {
229        self.matches(candidate.path())
230    }
231
232    pub fn matches_candidate_into(&self, candidate: &Candidate<'_>, into: &mut Vec<usize>) {
233        self.matches_into(candidate.path(), into);
234    }
235}
236
237/// A builder for constructing a [`TinyGlobSet`].
238#[derive(Clone, Debug, Default)]
239pub struct TinyGlobSetBuilder {
240    patterns: Vec<Glob>,
241}
242
243impl TinyGlobSetBuilder {
244    pub fn new() -> Self {
245        Self::default()
246    }
247
248    pub fn add(&mut self, glob: Glob) -> &mut Self {
249        self.patterns.push(glob);
250        self
251    }
252
253    pub fn build(&self) -> Result<TinyGlobSet, crate::error::Error> {
254        let pat_strs: Vec<&str> = self.patterns.iter().map(Glob::glob).collect();
255        let strats = strategy::build(&pat_strs);
256
257        let mut ac_patterns: Vec<String> = Vec::new();
258        let mut ac_to_glob: Vec<usize> = Vec::new();
259        let mut always_check: Vec<usize> = Vec::new();
260
261        for &idx in &strats.glob_indices {
262            match literal::extract_literal(self.patterns[idx].glob()) {
263                Some(lit) => {
264                    ac_patterns.push(String::from(lit));
265                    ac_to_glob.push(idx);
266                }
267                None => {
268                    always_check.push(idx);
269                }
270            }
271        }
272
273        let ac =
274            if ac_patterns.is_empty() {
275                None
276            } else {
277                Some(AhoCorasick::builder().build(&ac_patterns).map_err(|_| {
278                    crate::error::Error::new(crate::error::ErrorKind::UnclosedClass)
279                })?)
280            };
281
282        Ok(TinyGlobSet {
283            patterns: self.patterns.clone(),
284            ext_any: strats.ext_any,
285            ext_local: strats.ext_local,
286            literals: strats.literals,
287            prefixes: strats.prefixes,
288            suffixes: strats.suffixes,
289            compound_suffixes: strats.compound_suffixes,
290            prefix_suffixes: strats.prefix_suffixes,
291            ac,
292            ac_to_glob,
293            always_check,
294        })
295    }
296}