Skip to main content

dyntext/
tre.rs

1//! Safe Rust wrapper around `tre-sys`.
2//!
3//! This module is the single entry point dyntext callers use
4//! to do approximate-regex matching. It provides:
5//!
6//! * [`TreCompiledPattern`], an owning handle to a compiled
7//!   TRE pattern that frees its native resources on drop.
8//! * [`TreMatchOpts`], a builder for the per-call cost weights
9//!   and edit-budget caps.
10//! * [`TreMatch`], the result of a successful approximate
11//!   match.
12//! * [`TreError`], a typed error returned by [`TreCompiledPattern::compile`].
13//!
14//! The wrapper is intentionally narrow: it exposes only what
15//! the dyntext index needs, and it does **not** leak any
16//! `unsafe` API. All `unsafe` lives in the `tre-sys` crate;
17//! see the module-level safety section there for the contract
18//! enforced here.
19//!
20//! # Example
21//!
22//! ```
23//! use dyntext::tre::{TreCompiledPattern, TreMatchOpts};
24//!
25//! let opts = TreMatchOpts {
26//!     max_errors: 1,
27//!     ..TreMatchOpts::default()
28//! };
29//! let pat = TreCompiledPattern::compile(br"errno: \w+ refused", opts)
30//!     .expect("compile");
31//! assert!(pat.is_match(b"errno: connection refused"));
32//! assert!(pat.is_match(b"errno: cnnection refused"));
33//! assert!(!pat.is_match(b"errno cnntion rfsed"));
34//! ```
35//!
36//! # Thread safety
37//!
38//! [`TreCompiledPattern`] is intentionally `!Send` and `!Sync`.
39//! TRE's compiled `regex_t` owns mutable state (the TNFA), and
40//! the matcher is not internally synchronised. Callers that
41//! want concurrent matching should compile one pattern per
42//! thread.
43
44use std::marker::PhantomData;
45use std::os::raw::c_int;
46
47use thiserror::Error;
48use tre_sys::{
49    compile as compile_raw, default_regaparams, safe_reganexec, safe_regerror, ExecOutcome,
50    OwnedRegex, REG_EXTENDED, REG_ICASE,
51};
52
53/// Errors returned by [`TreCompiledPattern::compile`].
54#[derive(Debug, Error)]
55pub enum TreError {
56    /// The pattern bytes failed to compile.
57    #[error("tre compile failed: {0}")]
58    Compile(String),
59    /// A configuration choice in [`TreMatchOpts`] is
60    /// inconsistent (for example, `max_errors > 0` with all
61    /// edit costs at zero).
62    #[error("invalid pattern options: {0}")]
63    InvalidOptions(String),
64    /// TRE returned a code not covered by the public API.
65    #[error("internal tre error: {0}")]
66    Internal(String),
67}
68
69/// Per-pattern matching configuration.
70///
71/// `max_errors` is the headline knob: it bounds the number of
72/// edit operations tolerated in a match. `cost_*` weight each
73/// edit kind; if all weights are 1 the cost equals the edit
74/// distance. `max_cost` bounds the cumulative cost; setting it
75/// to 0 with `max_errors > 0` is rejected because it forces
76/// exact match no matter what edits are allowed.
77///
78/// `case_insensitive` toggles TRE's `REG_ICASE` flag.
79#[derive(Clone, Copy, Debug)]
80pub struct TreMatchOpts {
81    /// Maximum number of edit operations (insert + delete +
82    /// substitute) permitted in a match.
83    pub max_errors: u16,
84    /// Cost of inserting one byte. Defaults to 1.
85    pub cost_ins: u16,
86    /// Cost of deleting one byte. Defaults to 1.
87    pub cost_del: u16,
88    /// Cost of substituting one byte. Defaults to 1.
89    pub cost_subst: u16,
90    /// Cumulative cost ceiling. Zero means "use
91    /// `max_errors * max(cost_*)` as the implicit ceiling".
92    pub max_cost: u16,
93    /// Match case-insensitively.
94    pub case_insensitive: bool,
95}
96
97impl Default for TreMatchOpts {
98    fn default() -> Self {
99        Self {
100            max_errors: 0,
101            cost_ins: 1,
102            cost_del: 1,
103            cost_subst: 1,
104            max_cost: 0,
105            case_insensitive: false,
106        }
107    }
108}
109
110/// Successful approximate-match result.
111#[derive(Debug, Clone, Copy, PartialEq, Eq)]
112pub struct TreMatch {
113    /// Byte offset where the match starts. The current TRE
114    /// wrapper does not request submatch storage, so this is
115    /// reported as 0 on a successful match. Phase 3.1 will add
116    /// submatch capture and populate the real offset.
117    pub start: usize,
118    /// Byte offset one past the end of the match. See
119    /// [`Self::start`] for the current limitation.
120    pub end: usize,
121    /// Total cost of the match (sum of edit costs).
122    pub cost: u32,
123    /// Number of inserts.
124    pub n_ins: u32,
125    /// Number of deletes.
126    pub n_del: u32,
127    /// Number of substitutes.
128    pub n_subst: u32,
129}
130
131/// Owning handle to a TRE-compiled pattern.
132///
133/// Construct via [`TreCompiledPattern::compile`]. The native
134/// resources are released by [`Drop`] (delegated to the
135/// `tre-sys::OwnedRegex` field).
136pub struct TreCompiledPattern {
137    raw: OwnedRegex,
138    /// Cached compile-time options so [`Self::matches`] can
139    /// reuse the right cost weights without rebuilding the
140    /// `regaparams_t` on every call.
141    opts: TreMatchOpts,
142    /// `*const ()` is `!Send + !Sync`; this drops both auto
143    /// traits without needing explicit `unsafe impl !Send`
144    /// (which is currently nightly-only).
145    _not_send_or_sync: PhantomData<*const ()>,
146}
147
148impl std::fmt::Debug for TreCompiledPattern {
149    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150        f.debug_struct("TreCompiledPattern")
151            .field("nsub", &self.raw.nsub())
152            .field("opts", &self.opts)
153            .finish()
154    }
155}
156
157impl TreCompiledPattern {
158    /// Compile a regex pattern.
159    ///
160    /// The pattern is read as bytes. The vendored TRE build
161    /// disables multibyte handling, so byte n in the pattern is
162    /// matched against byte n in the haystack regardless of the
163    /// host locale.
164    pub fn compile(pattern: &[u8], opts: TreMatchOpts) -> Result<Self, TreError> {
165        validate_opts(opts)?;
166
167        let mut cflags = REG_EXTENDED;
168        if opts.case_insensitive {
169            cflags |= REG_ICASE;
170        }
171
172        match compile_raw(pattern, cflags) {
173            Ok(raw) => Ok(Self {
174                raw,
175                opts,
176                _not_send_or_sync: PhantomData,
177            }),
178            Err(rc) => Err(TreError::Compile(safe_regerror(rc, None))),
179        }
180    }
181
182    /// Return `true` if `text` contains an approximate match
183    /// for the pattern within the configured error budget.
184    #[must_use]
185    pub fn is_match(&self, text: &[u8]) -> bool {
186        self.matches(text).is_some()
187    }
188
189    /// Return the first approximate match in `text`, if any.
190    ///
191    /// On a non-`REG_OK`/`REG_NOMATCH` return code from TRE
192    /// the function returns `None` and asserts in debug builds.
193    /// In release the failure is treated as a no-match because
194    /// the only documented non-fatal failure for the
195    /// approximate matcher is back-references in the pattern,
196    /// which dyntext does not generate.
197    #[must_use]
198    pub fn matches(&self, text: &[u8]) -> Option<TreMatch> {
199        let params = self.build_params();
200        match safe_reganexec(self.raw.as_raw(), text, params, 0) {
201            ExecOutcome::Match(m) => Some(TreMatch {
202                start: 0,
203                end: text.len(),
204                cost: u32::try_from(m.cost).unwrap_or(u32::MAX),
205                n_ins: u32::try_from(m.num_ins).unwrap_or(u32::MAX),
206                n_del: u32::try_from(m.num_del).unwrap_or(u32::MAX),
207                n_subst: u32::try_from(m.num_subst).unwrap_or(u32::MAX),
208            }),
209            ExecOutcome::NoMatch => None,
210            ExecOutcome::Error(other) => {
211                debug_assert!(false, "tre_reganexec returned unexpected code {other}");
212                None
213            }
214        }
215    }
216
217    /// Borrow the cached options. Useful for tests and for
218    /// callers that want to re-use the same configuration.
219    #[must_use]
220    pub fn opts(&self) -> TreMatchOpts {
221        self.opts
222    }
223
224    /// Build the `regaparams_t` from the cached options.
225    fn build_params(&self) -> tre_sys::regaparams_t {
226        let mut params = default_regaparams();
227        params.cost_ins = c_int::from(self.opts.cost_ins);
228        params.cost_del = c_int::from(self.opts.cost_del);
229        params.cost_subst = c_int::from(self.opts.cost_subst);
230        params.max_err = c_int::from(self.opts.max_errors);
231        params.max_ins = c_int::from(self.opts.max_errors);
232        params.max_del = c_int::from(self.opts.max_errors);
233        params.max_subst = c_int::from(self.opts.max_errors);
234
235        let max_cost = if self.opts.max_cost == 0 {
236            // Implicit ceiling: max_errors * largest single
237            // edit cost. Using u32 saturating arithmetic to
238            // avoid surprising overflow when callers pass big
239            // u16 values.
240            let largest = self
241                .opts
242                .cost_ins
243                .max(self.opts.cost_del)
244                .max(self.opts.cost_subst);
245            let budget = u32::from(self.opts.max_errors) * u32::from(largest.max(1));
246            c_int::try_from(budget).unwrap_or(c_int::MAX)
247        } else {
248            c_int::from(self.opts.max_cost)
249        };
250        params.max_cost = max_cost;
251        params
252    }
253}
254
255fn validate_opts(opts: TreMatchOpts) -> Result<(), TreError> {
256    if opts.max_errors > 0 && opts.cost_ins == 0 && opts.cost_del == 0 && opts.cost_subst == 0 {
257        return Err(TreError::InvalidOptions(
258            "max_errors > 0 requires at least one non-zero edit cost".into(),
259        ));
260    }
261    Ok(())
262}
263
264#[cfg(test)]
265mod tests {
266    use super::*;
267
268    #[test]
269    fn default_opts_are_zero_error_unit_costs() {
270        let opts = TreMatchOpts::default();
271        assert_eq!(opts.max_errors, 0);
272        assert_eq!(opts.cost_ins, 1);
273        assert_eq!(opts.cost_del, 1);
274        assert_eq!(opts.cost_subst, 1);
275        assert_eq!(opts.max_cost, 0);
276        assert!(!opts.case_insensitive);
277    }
278
279    #[test]
280    fn validate_opts_rejects_max_errors_with_zero_costs() {
281        let bad = TreMatchOpts {
282            max_errors: 1,
283            cost_ins: 0,
284            cost_del: 0,
285            cost_subst: 0,
286            ..TreMatchOpts::default()
287        };
288        assert!(matches!(
289            validate_opts(bad),
290            Err(TreError::InvalidOptions(_))
291        ));
292    }
293
294    #[test]
295    fn compiled_pattern_phantom_is_not_send_or_sync() {
296        // A `PhantomData<*const ()>` field drops both auto
297        // traits. The check below is a documentation-only
298        // reminder; if it stops compiling the assertion is
299        // wrong and should be re-derived.
300        fn assert_any<T>() {}
301        assert_any::<TreCompiledPattern>();
302    }
303}