tre_regex/
approx.rs

1use std::borrow::Cow;
2use std::ffi::c_int;
3use std::hint::unreachable_unchecked;
4
5use crate::{
6    err::{BindingErrorCode, ErrorKind, RegexError, Result},
7    tre, Regex, RegexecFlags,
8};
9
/// Approximate match result over a `&str` input, as returned by [`Regex::regaexec`].
///
/// Each present match slot is itself a [`Result`], because a matched byte range may fail to
/// decode as UTF-8.
pub type RegApproxMatchStr<'a> = RegApproxMatch<&'a str, Result<Cow<'a, str>>>;
/// Approximate match result over a raw byte input, as returned by [`Regex::regaexec_bytes`].
pub type RegApproxMatchBytes<'a> = RegApproxMatch<&'a [u8], Cow<'a, [u8]>>;
12
/// Regex params passed to approximate matching functions such as [`regaexec`]
///
/// This is a newtype around [`regaparams_t`](tre_regex_sys::regaparams_t). Values are built by
/// chaining the setter methods, each of which returns a modified copy; the wrapped struct can
/// also be reached directly through [`RegApproxParams::get`] and [`RegApproxParams::get_mut`].
#[cfg(feature = "approx")]
#[derive(Copy, Clone, Debug)]
pub struct RegApproxParams(tre::regaparams_t);
17
18impl RegApproxParams {
19    /// Creates a new empty [`RegApproxParams`] object.
20    #[must_use]
21    #[inline]
22    pub fn new() -> Self {
23        Self(tre::regaparams_t::default())
24    }
25
26    /// Sets the [`cost_ins`](tre_regex_sys::regaparams_t::cost_ins) element.
27    #[must_use]
28    #[inline]
29    pub const fn cost_ins(&self, cost_ins: c_int) -> Self {
30        let mut copy = *self;
31        copy.0.cost_ins = cost_ins;
32        copy
33    }
34
35    /// Sets the [`cost_del`](tre_regex_sys::regaparams_t::cost_del) element.
36    #[must_use]
37    #[inline]
38    pub const fn cost_del(&self, cost_del: c_int) -> Self {
39        let mut copy = *self;
40        copy.0.cost_del = cost_del;
41        copy
42    }
43
44    /// Sets the [`cost_subst`](tre_regex_sys::regaparams_t::cost_subst) element.
45    #[must_use]
46    #[inline]
47    pub const fn cost_subst(&self, cost_subst: c_int) -> Self {
48        let mut copy = *self;
49        copy.0.cost_subst = cost_subst;
50        copy
51    }
52
53    /// Sets the [`max_cost`](tre_regex_sys::regaparams_t::max_cost) element.
54    #[must_use]
55    #[inline]
56    pub const fn max_cost(&self, max_cost: c_int) -> Self {
57        let mut copy = *self;
58        copy.0.max_cost = max_cost;
59        copy
60    }
61
62    /// Sets the [`max_ins`](tre_regex_sys::regaparams_t::max_ins) element.
63    #[must_use]
64    #[inline]
65    pub const fn max_ins(&self, max_ins: c_int) -> Self {
66        let mut copy = *self;
67        copy.0.max_ins = max_ins;
68        copy
69    }
70
71    /// Sets the [`max_del`](tre_regex_sys::regaparams_t::max_del) element.
72    #[must_use]
73    #[inline]
74    pub const fn max_del(&self, max_del: c_int) -> Self {
75        let mut copy = *self;
76        copy.0.max_del = max_del;
77        copy
78    }
79
80    /// Sets the [`max_subst`](tre_regex_sys::regaparams_t::max_subst) element.
81    #[must_use]
82    #[inline]
83    pub const fn max_subst(&self, max_subst: c_int) -> Self {
84        let mut copy = *self;
85        copy.0.max_subst = max_subst;
86        copy
87    }
88
89    /// Sets the [`max_err`](tre_regex_sys::regaparams_t::max_err) element.
90    #[must_use]
91    #[inline]
92    pub const fn max_err(&self, max_err: c_int) -> Self {
93        let mut copy = *self;
94        copy.0.max_err = max_err;
95        copy
96    }
97
98    /// Get an immutable reference to the underlying [`regaparams_t`](tre_regex_sys::regaparams_t) object.
99    #[must_use]
100    #[inline]
101    pub const fn get(&self) -> &tre::regaparams_t {
102        &self.0
103    }
104
105    /// Get a mutable reference to the underlying [`regaparams_t`](tre_regex_sys::regaparams_t) object.
106    #[must_use]
107    #[inline]
108    pub fn get_mut(&mut self) -> &mut tre::regaparams_t {
109        &mut self.0
110    }
111}
112
113impl Default for RegApproxParams {
114    fn default() -> Self {
115        Self::new()
116    }
117}
118
/// This struct is returned by [`regaexec`] and friends.
///
/// The match results from this function are very complex. See the [TRE documentation] for details
/// on how this all works and corresponding fields, and what they mean.
///
/// This structure should never be instantiated outside the library.
///
/// `Data` is the type of the input that was searched and `Res` is the type of each individual
/// match result; see [`RegApproxMatchStr`] and [`RegApproxMatchBytes`] for the concrete
/// instantiations used by the library.
///
/// [TRE documentation]: <https://laurikari.net/tre/documentation/regaexec/>
#[derive(Clone, Debug)]
pub struct RegApproxMatch<Data, Res> {
    // The original input that was matched against.
    data: Data,
    // One entry per requested match slot; `None` marks a slot that did not match.
    matches: Vec<Option<Res>>,
    // The raw TRE match structure, which carries the cost and edit counts.
    amatch: tre::regamatch_t,
}
133
134impl<Data, Res> RegApproxMatch<Data, Res> {
135    pub(crate) fn new(data: Data, matches: Vec<Option<Res>>, amatch: tre::regamatch_t) -> Self {
136        Self {
137            data,
138            matches,
139            amatch,
140        }
141    }
142
143    /// Gets the cost of the match
144    pub const fn cost(&self) -> c_int {
145        self.amatch.cost
146    }
147
148    /// Gets the number of insertions if the match
149    pub const fn num_ins(&self) -> c_int {
150        self.amatch.num_ins
151    }
152
153    /// Gets the number of deletions if the match
154    pub const fn num_del(&self) -> c_int {
155        self.amatch.num_del
156    }
157
158    /// Get the number of substitutions in the match
159    pub const fn num_subst(&self) -> c_int {
160        self.amatch.num_subst
161    }
162
163    /// Gets an immutable reference to the underlying data
164    pub const fn get_orig_data(&self) -> &Data {
165        &self.data
166    }
167
168    /// Gets the matches returned by this, as references to the data
169    pub const fn get_matches(&self) -> &Vec<Option<Res>> {
170        &self.matches
171    }
172
173    /// Gets a reference to the underlying [`regamatch_t`](tre_regex_sys::regamatch_t) object.
174    pub const fn get_regamatch(&self) -> &tre::regamatch_t {
175        &self.amatch
176    }
177}
178
179impl Regex {
180    /// Performs an approximate regex search on the passed string, returning `nmatches` results.
181    ///
182    /// Non-matching subexpressions or patterns will return `None` in the results.
183    ///
184    /// # Arguments
185    /// * `string`: string to match against `compiled_reg`
186    /// * `params`: see [`RegApproxParams`]
187    /// * `nmatches`: number of matches to return
188    /// * `flags`: [`RegexecFlags`] to pass to [`tre_reganexec`](tre_regex_sys::tre_reganexec).
189    ///
190    /// # Returns
191    /// If no error was found, a [`Vec`] of [`Option`]s will be returned.
192    ///
193    /// If a given match index is empty, The `Option` will be `None`. Otherwise, [`Result`]s will be
194    /// returned, containing either errors or substrings of the matches. Errors may be returned due to
195    /// decoding problems, such as split codepoints.
196    ///
197    /// # Errors
198    /// If an error is encountered during matching, it returns a [`RegexError`]. Match results may also
199    /// return errors, if decoding into UTF-8 was unsuccessful for whatever reason.
200    ///
201    /// # Caveats
202    /// Unless copied, the match results must live at least as long as `string`. This is because they are
203    /// slices into `string` under the hood, for efficiency.
204    ///
205    /// # Examples
206    /// ```
207    /// # use tre_regex::Result;
208    /// # fn main() -> Result<()> {
209    /// use tre_regex::{RegcompFlags, RegexecFlags, RegApproxParams, Regex};
210    ///
211    /// let regcomp_flags = RegcompFlags::new()
212    ///     .add(RegcompFlags::EXTENDED)
213    ///     .add(RegcompFlags::ICASE);
214    /// let regaexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
215    /// let regaexec_params = RegApproxParams::new()
216    ///     .cost_ins(1)
217    ///     .cost_del(1)
218    ///     .cost_subst(1)
219    ///     .max_cost(2)
220    ///     .max_del(2)
221    ///     .max_ins(2)
222    ///     .max_subst(2)
223    ///     .max_err(2);
224    ///
225    /// let compiled_reg = Regex::new("^(hello).*(world)$", regcomp_flags)?;
226    /// let result = compiled_reg.regaexec(
227    ///     "hello world",      // String to match against
228    ///     &regaexec_params,   // Matching parameters
229    ///     3,                  // Number of matches we want
230    ///     regaexec_flags      // Flags
231    /// )?;
232    ///
233    /// for (i, matched) in result.get_matches().into_iter().enumerate() {
234    ///     match matched {
235    ///         Some(substr) => println!("Match {i}: {}", substr.as_ref().unwrap()),
236    ///         None => println!("Match {i}: <None>"),
237    ///     }
238    /// }
239    /// # Ok(())
240    /// # }
241    /// ```
242    #[inline]
243    pub fn regaexec<'a>(
244        &self,
245        string: &'a str,
246        params: &RegApproxParams,
247        nmatches: usize,
248        flags: RegexecFlags,
249    ) -> Result<RegApproxMatchStr<'a>> {
250        let data = string.as_bytes();
251        let match_results = self.regaexec_bytes(data, params, nmatches, flags)?;
252
253        let mut result: Vec<Option<Result<Cow<'a, str>>>> = Vec::with_capacity(nmatches);
254        for pmatch in match_results.get_matches() {
255            let Some(pmatch) = pmatch else {
256                result.push(None);
257                continue;
258            };
259
260            #[allow(clippy::match_wildcard_for_single_variants)]
261            result.push(Some(match pmatch {
262                Cow::Borrowed(pmatch) => match std::str::from_utf8(pmatch) {
263                    Ok(s) => Ok(s.into()),
264                    Err(e) => Err(RegexError::new(
265                        ErrorKind::Binding(BindingErrorCode::ENCODING),
266                        &format!("UTF-8 encoding error: {e}"),
267                    )),
268                },
269                // SAFETY: cannot get here, we only have borrowed values.
270                _ => unsafe { unreachable_unchecked() },
271            }));
272        }
273
274        Ok(RegApproxMatchStr::new(
275            string,
276            result,
277            *match_results.get_regamatch(),
278        ))
279    }
280
281    /// Performs an approximate regex search on the passed bytes, returning `nmatches` results.
282    ///
283    /// This function should only be used if you need to match raw bytes, or bytes which may not be
284    /// UTF-8 compliant. Otherwise, [`regaexec`] is recommended instead.
285    ///
286    /// # Arguments
287    /// * `data`: [`u8`] slice to match against `compiled_reg`
288    /// * `params`: see [`RegApproxParams`]
289    /// * `nmatches`: number of matches to return
290    /// * `flags`: [`RegexecFlags`] to pass to [`tre_reganexec`](tre_regex_sys::tre_reganexec).
291    ///
292    /// # Returns
293    /// If no error was found, a [`Vec`] of [`Option`]s will be returned.
294    ///
295    /// If a given match index is empty, The `Option` will be `None`. Otherwise, [`u8`] slices will be
296    /// returned.
297    ///
298    /// # Errors
299    /// If an error is encountered during matching, it returns a [`RegexError`].
300    ///
301    /// # Caveats
302    /// Unless copied, the match results must live at least as long as `data`. This is because they are
303    /// slices into `data` under the hood, for efficiency.
304    ///
305    /// # Examples
306    /// ```
307    /// # use tre_regex::Result;
308    /// # fn main() -> Result<()> {
309    /// use tre_regex::{RegcompFlags, RegexecFlags, RegApproxParams, Regex};
310    ///
311    /// let regcomp_flags = RegcompFlags::new()
312    ///     .add(RegcompFlags::EXTENDED)
313    ///     .add(RegcompFlags::ICASE);
314    /// let regaexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
315    /// let regaexec_params = RegApproxParams::new()
316    ///     .cost_ins(1)
317    ///     .cost_del(1)
318    ///     .cost_subst(1)
319    ///     .max_cost(2)
320    ///     .max_del(2)
321    ///     .max_ins(2)
322    ///     .max_subst(2)
323    ///     .max_err(2);
324    ///
325    /// let compiled_reg = Regex::new("^(hello).*(world)$", regcomp_flags)?;
326    /// let result = compiled_reg.regaexec_bytes(
327    ///     b"hello world",     // Bytes to match against
328    ///     &regaexec_params,   // Matching parameters
329    ///     3,                  // Number of matches we want
330    ///     regaexec_flags      // Flags
331    /// )?;
332    ///
333    /// for (i, matched) in result.get_matches().into_iter().enumerate() {
334    ///     match matched {
335    ///         Some(substr) => println!(
336    ///             "Match {i}: {}",
337    ///             std::str::from_utf8(substr).unwrap()
338    ///         ),
339    ///         None => println!("Match {i}: <None>"),
340    ///     }
341    /// }
342    /// # Ok(())
343    /// # }
344    /// ```
345    pub fn regaexec_bytes<'a>(
346        &self,
347        data: &'a [u8],
348        params: &RegApproxParams,
349        nmatches: usize,
350        flags: RegexecFlags,
351    ) -> Result<RegApproxMatchBytes<'a>> {
352        let Some(compiled_reg_obj) = self.get() else {
353            return Err(RegexError::new(
354                ErrorKind::Binding(BindingErrorCode::REGEX_VACANT),
355                "Attempted to unwrap a vacant Regex object",
356            ));
357        };
358        let mut match_vec: Vec<tre::regmatch_t> =
359            vec![tre::regmatch_t { rm_so: 0, rm_eo: 0 }; nmatches];
360        let mut amatch = tre::regamatch_t {
361            nmatch: nmatches,
362            pmatch: match_vec.as_mut_ptr(),
363            ..Default::default()
364        };
365
366        // SAFETY: compiled_reg is a wrapped type (see safety concerns for Regex). data is read-only.
367        // match_vec has enough room for everything. flags also cannot wrap around.
368        #[allow(clippy::cast_possible_wrap)]
369        let result = unsafe {
370            tre::tre_reganexec(
371                compiled_reg_obj,
372                data.as_ptr().cast::<i8>(),
373                data.len(),
374                &mut amatch,
375                *params.get(),
376                flags.get(),
377            )
378        };
379        if result != 0 {
380            return Err(self.regerror(result));
381        }
382
383        let mut result: Vec<Option<Cow<'a, [u8]>>> = Vec::with_capacity(nmatches);
384        for pmatch in match_vec {
385            if pmatch.rm_so < 0 || pmatch.rm_eo < 0 {
386                result.push(None);
387                continue;
388            }
389
390            // Wraparound is impossible.
391            #[allow(clippy::cast_sign_loss)]
392            let start_offset = pmatch.rm_so as usize;
393            #[allow(clippy::cast_sign_loss)]
394            let end_offset = pmatch.rm_eo as usize;
395
396            result.push(Some(Cow::Borrowed(&data[start_offset..end_offset])));
397        }
398
399        Ok(RegApproxMatchBytes::new(data, result, amatch))
400    }
401}
402
403/// Performs an approximate regex search on the passed string, returning `nmatches` results.
404///
405/// This is a thin wrapper around [`Regex::regaexec`].
406///
407/// Non-matching subexpressions or patterns will return `None` in the results.
408///
409/// # Arguments
410/// * `compiled_reg`: the compiled [`Regex`] object.
411/// * `string`: string to match against `compiled_reg`
412/// * `params`: see [`RegApproxParams`]
413/// * `nmatches`: number of matches to return
414/// * `flags`: [`RegexecFlags`] to pass to [`tre_regnexec`](tre_regex_sys::tre_regnexec).
415///
416/// # Returns
417/// If no error was found, a [`Vec`] of [`Option`]s will be returned.
418///
419/// If a given match index is empty, The `Option` will be `None`. Otherwise, [`Result`]s will be
420/// returned, containing either errors or substrings of the matches. Errors may be returned due to
421/// decoding problems, such as split codepoints.
422///
423/// # Errors
424/// If an error is encountered during matching, it returns a [`RegexError`]. Match results may also
425/// return errors, if decoding into UTF-8 was unsuccessful for whatever reason.
426///
427/// # Caveats
428/// Unless copied, the match results must live at least as long as `string`. This is because they are
429/// slices into `string` under the hood, for efficiency.
430///
431/// # Examples
432/// ```
433/// # use tre_regex::Result;
434/// # fn main() -> Result<()> {
435/// use tre_regex::{RegcompFlags, RegexecFlags, RegApproxParams, Regex, regaexec};
436///
437/// let regcomp_flags = RegcompFlags::new()
438///     .add(RegcompFlags::EXTENDED)
439///     .add(RegcompFlags::ICASE);
440/// let regaexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
441/// let regaexec_params = RegApproxParams::new()
442///     .cost_ins(1)
443///     .cost_del(1)
444///     .cost_subst(1)
445///     .max_cost(2)
446///     .max_del(2)
447///     .max_ins(2)
448///     .max_subst(2)
449///     .max_err(2);
450///
451/// let compiled_reg = Regex::new("^(hello).*(world)$", regcomp_flags)?;
452/// let result = regaexec(
453///     &compiled_reg,      // Compiled regex
454///     "hello world",      // String to match against
455///     &regaexec_params,   // Matching parameters
456///     3,                  // Number of matches we want
457///     regaexec_flags      // Flags
458/// )?;
459///
460/// for (i, matched) in result.get_matches().into_iter().enumerate() {
461///     match matched {
462///         Some(substr) => println!("Match {i}: {}", substr.as_ref().unwrap()),
463///         None => println!("Match {i}: <None>"),
464///     }
465/// }
466/// # Ok(())
467/// # }
468/// ```
469#[inline]
470pub fn regaexec<'a>(
471    compiled_reg: &Regex,
472    string: &'a str,
473    params: &RegApproxParams,
474    nmatches: usize,
475    flags: RegexecFlags,
476) -> Result<RegApproxMatchStr<'a>> {
477    compiled_reg.regaexec(string, params, nmatches, flags)
478}
479
480/// Performs an approximate regex search on the passed bytes, returning `nmatches` results.
481///
482/// This is a thin wrapper around [`Regex::regaexec_bytes`].
483///
484/// This function should only be used if you need to match raw bytes, or bytes which may not be
485/// UTF-8 compliant. Otherwise, [`regaexec`] is recommended instead.
486///
487/// # Arguments
488/// * `compiled_reg`: the compiled [`Regex`] object.
489/// * `data`: [`u8`] slice to match against `compiled_reg`
490/// * `params`: see [`RegApproxParams`]
491/// * `nmatches`: number of matches to return
492/// * `flags`: [`RegexecFlags`] to pass to [`tre_regnexec`](tre_regex_sys::tre_regnexec).
493///
494/// # Returns
495/// If no error was found, a [`Vec`] of [`Option`]s will be returned.
496///
497/// If a given match index is empty, The `Option` will be `None`. Otherwise, [`u8`] slices will be
498/// returned.
499///
500/// # Errors
501/// If an error is encountered during matching, it returns a [`RegexError`].
502///
503/// # Caveats
504/// Unless copied, the match results must live at least as long as `data`. This is because they are
505/// slices into `data` under the hood, for efficiency.
506///
507/// # Examples
508/// ```
509/// # use tre_regex::Result;
510/// # fn main() -> Result<()> {
511/// use tre_regex::{RegcompFlags, RegexecFlags, RegApproxParams, Regex, regaexec_bytes};
512///
513/// let regcomp_flags = RegcompFlags::new()
514///     .add(RegcompFlags::EXTENDED)
515///     .add(RegcompFlags::ICASE);
516/// let regaexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
517/// let regaexec_params = RegApproxParams::new()
518///     .cost_ins(1)
519///     .cost_del(1)
520///     .cost_subst(1)
521///     .max_cost(2)
522///     .max_del(2)
523///     .max_ins(2)
524///     .max_subst(2)
525///     .max_err(2);
526///
527/// let compiled_reg = Regex::new("^(hello).*(world)$", regcomp_flags)?;
528/// let result = regaexec_bytes(
529///     &compiled_reg,      // Compiled regex
530///     b"hello world",     // Bytes to match against
531///     &regaexec_params,   // Matching parameters
532///     3,                  // Number of matches we want
533///     regaexec_flags      // Flags
534/// )?;
535///
536/// for (i, matched) in result.get_matches().into_iter().enumerate() {
537///     match matched {
538///         Some(substr) => println!(
539///             "Match {i}: {}",
540///             std::str::from_utf8(substr).unwrap()
541///         ),
542///         None => println!("Match {i}: <None>"),
543///     }
544/// }
545/// # Ok(())
546/// # }
547/// ```
548#[inline]
549pub fn regaexec_bytes<'a>(
550    compiled_reg: &Regex,
551    data: &'a [u8],
552    params: &RegApproxParams,
553    nmatches: usize,
554    flags: RegexecFlags,
555) -> Result<RegApproxMatchBytes<'a>> {
556    compiled_reg.regaexec_bytes(data, params, nmatches, flags)
557}