dyntext/tre.rs
1//! Safe Rust wrapper around `tre-sys`.
2//!
3//! This module is the single entry point dyntext callers use
4//! to do approximate-regex matching. It provides:
5//!
6//! * [`TreCompiledPattern`], an owning handle to a compiled
7//! TRE pattern that frees its native resources on drop.
8//! * [`TreMatchOpts`], a builder for the per-call cost weights
9//! and edit-budget caps.
10//! * [`TreMatch`], the result of a successful approximate
11//! match.
12//! * [`TreError`], a typed error returned by [`TreCompiledPattern::compile`].
13//!
14//! The wrapper is intentionally narrow: it exposes only what
15//! the dyntext index needs, and it does **not** leak any
16//! `unsafe` API. All `unsafe` lives in the `tre-sys` crate;
17//! see the module-level safety section there for the contract
18//! enforced here.
19//!
20//! # Example
21//!
22//! ```
23//! use dyntext::tre::{TreCompiledPattern, TreMatchOpts};
24//!
25//! let opts = TreMatchOpts {
26//! max_errors: 1,
27//! ..TreMatchOpts::default()
28//! };
29//! let pat = TreCompiledPattern::compile(br"errno: \w+ refused", opts)
30//! .expect("compile");
31//! assert!(pat.is_match(b"errno: connection refused"));
32//! assert!(pat.is_match(b"errno: cnnection refused"));
33//! assert!(!pat.is_match(b"errno cnntion rfsed"));
34//! ```
35//!
36//! # Thread safety
37//!
38//! [`TreCompiledPattern`] is intentionally `!Send` and `!Sync`.
39//! TRE's compiled `regex_t` owns mutable state (the TNFA), and
40//! the matcher is not internally synchronised. Callers that
41//! want concurrent matching should compile one pattern per
42//! thread.
43
44use std::marker::PhantomData;
45use std::os::raw::c_int;
46
47use thiserror::Error;
48use tre_sys::{
49 compile as compile_raw, default_regaparams, safe_reganexec, safe_regerror, ExecOutcome,
50 OwnedRegex, REG_EXTENDED, REG_ICASE,
51};
52
53/// Errors returned by [`TreCompiledPattern::compile`].
54#[derive(Debug, Error)]
55pub enum TreError {
56 /// The pattern bytes failed to compile.
57 #[error("tre compile failed: {0}")]
58 Compile(String),
59 /// A configuration choice in [`TreMatchOpts`] is
60 /// inconsistent (for example, `max_errors > 0` with all
61 /// edit costs at zero).
62 #[error("invalid pattern options: {0}")]
63 InvalidOptions(String),
64 /// TRE returned a code not covered by the public API.
65 #[error("internal tre error: {0}")]
66 Internal(String),
67}
68
69/// Per-pattern matching configuration.
70///
71/// `max_errors` is the headline knob: it bounds the number of
72/// edit operations tolerated in a match. `cost_*` weight each
73/// edit kind; if all weights are 1 the cost equals the edit
74/// distance. `max_cost` bounds the cumulative cost; setting it
75/// to 0 with `max_errors > 0` is rejected because it forces
76/// exact match no matter what edits are allowed.
77///
78/// `case_insensitive` toggles TRE's `REG_ICASE` flag.
79#[derive(Clone, Copy, Debug)]
80pub struct TreMatchOpts {
81 /// Maximum number of edit operations (insert + delete +
82 /// substitute) permitted in a match.
83 pub max_errors: u16,
84 /// Cost of inserting one byte. Defaults to 1.
85 pub cost_ins: u16,
86 /// Cost of deleting one byte. Defaults to 1.
87 pub cost_del: u16,
88 /// Cost of substituting one byte. Defaults to 1.
89 pub cost_subst: u16,
90 /// Cumulative cost ceiling. Zero means "use
91 /// `max_errors * max(cost_*)` as the implicit ceiling".
92 pub max_cost: u16,
93 /// Match case-insensitively.
94 pub case_insensitive: bool,
95}
96
97impl Default for TreMatchOpts {
98 fn default() -> Self {
99 Self {
100 max_errors: 0,
101 cost_ins: 1,
102 cost_del: 1,
103 cost_subst: 1,
104 max_cost: 0,
105 case_insensitive: false,
106 }
107 }
108}
109
110/// Successful approximate-match result.
111#[derive(Debug, Clone, Copy, PartialEq, Eq)]
112pub struct TreMatch {
113 /// Byte offset where the match starts. The current TRE
114 /// wrapper does not request submatch storage, so this is
115 /// reported as 0 on a successful match. Phase 3.1 will add
116 /// submatch capture and populate the real offset.
117 pub start: usize,
118 /// Byte offset one past the end of the match. See
119 /// [`Self::start`] for the current limitation.
120 pub end: usize,
121 /// Total cost of the match (sum of edit costs).
122 pub cost: u32,
123 /// Number of inserts.
124 pub n_ins: u32,
125 /// Number of deletes.
126 pub n_del: u32,
127 /// Number of substitutes.
128 pub n_subst: u32,
129}
130
131/// Owning handle to a TRE-compiled pattern.
132///
133/// Construct via [`TreCompiledPattern::compile`]. The native
134/// resources are released by [`Drop`] (delegated to the
135/// `tre-sys::OwnedRegex` field).
136pub struct TreCompiledPattern {
137 raw: OwnedRegex,
138 /// Cached compile-time options so [`Self::matches`] can
139 /// reuse the right cost weights without rebuilding the
140 /// `regaparams_t` on every call.
141 opts: TreMatchOpts,
142 /// `*const ()` is `!Send + !Sync`; this drops both auto
143 /// traits without needing explicit `unsafe impl !Send`
144 /// (which is currently nightly-only).
145 _not_send_or_sync: PhantomData<*const ()>,
146}
147
148impl std::fmt::Debug for TreCompiledPattern {
149 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
150 f.debug_struct("TreCompiledPattern")
151 .field("nsub", &self.raw.nsub())
152 .field("opts", &self.opts)
153 .finish()
154 }
155}
156
157impl TreCompiledPattern {
158 /// Compile a regex pattern.
159 ///
160 /// The pattern is read as bytes. The vendored TRE build
161 /// disables multibyte handling, so byte n in the pattern is
162 /// matched against byte n in the haystack regardless of the
163 /// host locale.
164 pub fn compile(pattern: &[u8], opts: TreMatchOpts) -> Result<Self, TreError> {
165 validate_opts(opts)?;
166
167 let mut cflags = REG_EXTENDED;
168 if opts.case_insensitive {
169 cflags |= REG_ICASE;
170 }
171
172 match compile_raw(pattern, cflags) {
173 Ok(raw) => Ok(Self {
174 raw,
175 opts,
176 _not_send_or_sync: PhantomData,
177 }),
178 Err(rc) => Err(TreError::Compile(safe_regerror(rc, None))),
179 }
180 }
181
182 /// Return `true` if `text` contains an approximate match
183 /// for the pattern within the configured error budget.
184 #[must_use]
185 pub fn is_match(&self, text: &[u8]) -> bool {
186 self.matches(text).is_some()
187 }
188
189 /// Return the first approximate match in `text`, if any.
190 ///
191 /// On a non-`REG_OK`/`REG_NOMATCH` return code from TRE
192 /// the function returns `None` and asserts in debug builds.
193 /// In release the failure is treated as a no-match because
194 /// the only documented non-fatal failure for the
195 /// approximate matcher is back-references in the pattern,
196 /// which dyntext does not generate.
197 #[must_use]
198 pub fn matches(&self, text: &[u8]) -> Option<TreMatch> {
199 let params = self.build_params();
200 match safe_reganexec(self.raw.as_raw(), text, params, 0) {
201 ExecOutcome::Match(m) => Some(TreMatch {
202 start: 0,
203 end: text.len(),
204 cost: u32::try_from(m.cost).unwrap_or(u32::MAX),
205 n_ins: u32::try_from(m.num_ins).unwrap_or(u32::MAX),
206 n_del: u32::try_from(m.num_del).unwrap_or(u32::MAX),
207 n_subst: u32::try_from(m.num_subst).unwrap_or(u32::MAX),
208 }),
209 ExecOutcome::NoMatch => None,
210 ExecOutcome::Error(other) => {
211 debug_assert!(false, "tre_reganexec returned unexpected code {other}");
212 None
213 }
214 }
215 }
216
217 /// Borrow the cached options. Useful for tests and for
218 /// callers that want to re-use the same configuration.
219 #[must_use]
220 pub fn opts(&self) -> TreMatchOpts {
221 self.opts
222 }
223
224 /// Build the `regaparams_t` from the cached options.
225 fn build_params(&self) -> tre_sys::regaparams_t {
226 let mut params = default_regaparams();
227 params.cost_ins = c_int::from(self.opts.cost_ins);
228 params.cost_del = c_int::from(self.opts.cost_del);
229 params.cost_subst = c_int::from(self.opts.cost_subst);
230 params.max_err = c_int::from(self.opts.max_errors);
231 params.max_ins = c_int::from(self.opts.max_errors);
232 params.max_del = c_int::from(self.opts.max_errors);
233 params.max_subst = c_int::from(self.opts.max_errors);
234
235 let max_cost = if self.opts.max_cost == 0 {
236 // Implicit ceiling: max_errors * largest single
237 // edit cost. Using u32 saturating arithmetic to
238 // avoid surprising overflow when callers pass big
239 // u16 values.
240 let largest = self
241 .opts
242 .cost_ins
243 .max(self.opts.cost_del)
244 .max(self.opts.cost_subst);
245 let budget = u32::from(self.opts.max_errors) * u32::from(largest.max(1));
246 c_int::try_from(budget).unwrap_or(c_int::MAX)
247 } else {
248 c_int::from(self.opts.max_cost)
249 };
250 params.max_cost = max_cost;
251 params
252 }
253}
254
255fn validate_opts(opts: TreMatchOpts) -> Result<(), TreError> {
256 if opts.max_errors > 0 && opts.cost_ins == 0 && opts.cost_del == 0 && opts.cost_subst == 0 {
257 return Err(TreError::InvalidOptions(
258 "max_errors > 0 requires at least one non-zero edit cost".into(),
259 ));
260 }
261 Ok(())
262}
263
264#[cfg(test)]
265mod tests {
266 use super::*;
267
268 #[test]
269 fn default_opts_are_zero_error_unit_costs() {
270 let opts = TreMatchOpts::default();
271 assert_eq!(opts.max_errors, 0);
272 assert_eq!(opts.cost_ins, 1);
273 assert_eq!(opts.cost_del, 1);
274 assert_eq!(opts.cost_subst, 1);
275 assert_eq!(opts.max_cost, 0);
276 assert!(!opts.case_insensitive);
277 }
278
279 #[test]
280 fn validate_opts_rejects_max_errors_with_zero_costs() {
281 let bad = TreMatchOpts {
282 max_errors: 1,
283 cost_ins: 0,
284 cost_del: 0,
285 cost_subst: 0,
286 ..TreMatchOpts::default()
287 };
288 assert!(matches!(
289 validate_opts(bad),
290 Err(TreError::InvalidOptions(_))
291 ));
292 }
293
294 #[test]
295 fn compiled_pattern_phantom_is_not_send_or_sync() {
296 // A `PhantomData<*const ()>` field drops both auto
297 // traits. The check below is a documentation-only
298 // reminder; if it stops compiling the assertion is
299 // wrong and should be re-derived.
300 fn assert_any<T>() {}
301 assert_any::<TreCompiledPattern>();
302 }
303}