// tre_regex/approx.rs
1use std::borrow::Cow;
2use std::ffi::c_int;
3use std::hint::unreachable_unchecked;
4
5use crate::{
6 err::{BindingErrorCode, ErrorKind, RegexError, Result},
7 tre, Regex, RegexecFlags,
8};
9
10pub type RegApproxMatchStr<'a> = RegApproxMatch<&'a str, Result<Cow<'a, str>>>;
11pub type RegApproxMatchBytes<'a> = RegApproxMatch<&'a [u8], Cow<'a, [u8]>>;
12
13/// Regex params passed to approximate matching functions such as [`regaexec`]
14#[cfg(feature = "approx")]
15#[derive(Copy, Clone, Debug)]
16pub struct RegApproxParams(tre::regaparams_t);
17
18impl RegApproxParams {
19 /// Creates a new empty [`RegApproxParams`] object.
20 #[must_use]
21 #[inline]
22 pub fn new() -> Self {
23 Self(tre::regaparams_t::default())
24 }
25
26 /// Sets the [`cost_ins`](tre_regex_sys::regaparams_t::cost_ins) element.
27 #[must_use]
28 #[inline]
29 pub const fn cost_ins(&self, cost_ins: c_int) -> Self {
30 let mut copy = *self;
31 copy.0.cost_ins = cost_ins;
32 copy
33 }
34
35 /// Sets the [`cost_del`](tre_regex_sys::regaparams_t::cost_del) element.
36 #[must_use]
37 #[inline]
38 pub const fn cost_del(&self, cost_del: c_int) -> Self {
39 let mut copy = *self;
40 copy.0.cost_del = cost_del;
41 copy
42 }
43
44 /// Sets the [`cost_subst`](tre_regex_sys::regaparams_t::cost_subst) element.
45 #[must_use]
46 #[inline]
47 pub const fn cost_subst(&self, cost_subst: c_int) -> Self {
48 let mut copy = *self;
49 copy.0.cost_subst = cost_subst;
50 copy
51 }
52
53 /// Sets the [`max_cost`](tre_regex_sys::regaparams_t::max_cost) element.
54 #[must_use]
55 #[inline]
56 pub const fn max_cost(&self, max_cost: c_int) -> Self {
57 let mut copy = *self;
58 copy.0.max_cost = max_cost;
59 copy
60 }
61
62 /// Sets the [`max_ins`](tre_regex_sys::regaparams_t::max_ins) element.
63 #[must_use]
64 #[inline]
65 pub const fn max_ins(&self, max_ins: c_int) -> Self {
66 let mut copy = *self;
67 copy.0.max_ins = max_ins;
68 copy
69 }
70
71 /// Sets the [`max_del`](tre_regex_sys::regaparams_t::max_del) element.
72 #[must_use]
73 #[inline]
74 pub const fn max_del(&self, max_del: c_int) -> Self {
75 let mut copy = *self;
76 copy.0.max_del = max_del;
77 copy
78 }
79
80 /// Sets the [`max_subst`](tre_regex_sys::regaparams_t::max_subst) element.
81 #[must_use]
82 #[inline]
83 pub const fn max_subst(&self, max_subst: c_int) -> Self {
84 let mut copy = *self;
85 copy.0.max_subst = max_subst;
86 copy
87 }
88
89 /// Sets the [`max_err`](tre_regex_sys::regaparams_t::max_err) element.
90 #[must_use]
91 #[inline]
92 pub const fn max_err(&self, max_err: c_int) -> Self {
93 let mut copy = *self;
94 copy.0.max_err = max_err;
95 copy
96 }
97
98 /// Get an immutable reference to the underlying [`regaparams_t`](tre_regex_sys::regaparams_t) object.
99 #[must_use]
100 #[inline]
101 pub const fn get(&self) -> &tre::regaparams_t {
102 &self.0
103 }
104
105 /// Get a mutable reference to the underlying [`regaparams_t`](tre_regex_sys::regaparams_t) object.
106 #[must_use]
107 #[inline]
108 pub fn get_mut(&mut self) -> &mut tre::regaparams_t {
109 &mut self.0
110 }
111}
112
113impl Default for RegApproxParams {
114 fn default() -> Self {
115 Self::new()
116 }
117}
118
119/// This struct is returned by [`regaexec`] and friends.
120///
121/// The match results from this function are very complex. See the [TRE documentation] for details
122/// on how this all works and corresponding fields, and what they mean.
123///
124/// This structure should never be instantiated outside the library.
125///
126/// [TRE documentation]: <https://laurikari.net/tre/documentation/regaexec/>
127#[derive(Clone, Debug)]
128pub struct RegApproxMatch<Data, Res> {
129 data: Data,
130 matches: Vec<Option<Res>>,
131 amatch: tre::regamatch_t,
132}
133
134impl<Data, Res> RegApproxMatch<Data, Res> {
135 pub(crate) fn new(data: Data, matches: Vec<Option<Res>>, amatch: tre::regamatch_t) -> Self {
136 Self {
137 data,
138 matches,
139 amatch,
140 }
141 }
142
143 /// Gets the cost of the match
144 pub const fn cost(&self) -> c_int {
145 self.amatch.cost
146 }
147
148 /// Gets the number of insertions if the match
149 pub const fn num_ins(&self) -> c_int {
150 self.amatch.num_ins
151 }
152
153 /// Gets the number of deletions if the match
154 pub const fn num_del(&self) -> c_int {
155 self.amatch.num_del
156 }
157
158 /// Get the number of substitutions in the match
159 pub const fn num_subst(&self) -> c_int {
160 self.amatch.num_subst
161 }
162
163 /// Gets an immutable reference to the underlying data
164 pub const fn get_orig_data(&self) -> &Data {
165 &self.data
166 }
167
168 /// Gets the matches returned by this, as references to the data
169 pub const fn get_matches(&self) -> &Vec<Option<Res>> {
170 &self.matches
171 }
172
173 /// Gets a reference to the underlying [`regamatch_t`](tre_regex_sys::regamatch_t) object.
174 pub const fn get_regamatch(&self) -> &tre::regamatch_t {
175 &self.amatch
176 }
177}
178
179impl Regex {
180 /// Performs an approximate regex search on the passed string, returning `nmatches` results.
181 ///
182 /// Non-matching subexpressions or patterns will return `None` in the results.
183 ///
184 /// # Arguments
185 /// * `string`: string to match against `compiled_reg`
186 /// * `params`: see [`RegApproxParams`]
187 /// * `nmatches`: number of matches to return
188 /// * `flags`: [`RegexecFlags`] to pass to [`tre_reganexec`](tre_regex_sys::tre_reganexec).
189 ///
190 /// # Returns
191 /// If no error was found, a [`Vec`] of [`Option`]s will be returned.
192 ///
193 /// If a given match index is empty, The `Option` will be `None`. Otherwise, [`Result`]s will be
194 /// returned, containing either errors or substrings of the matches. Errors may be returned due to
195 /// decoding problems, such as split codepoints.
196 ///
197 /// # Errors
198 /// If an error is encountered during matching, it returns a [`RegexError`]. Match results may also
199 /// return errors, if decoding into UTF-8 was unsuccessful for whatever reason.
200 ///
201 /// # Caveats
202 /// Unless copied, the match results must live at least as long as `string`. This is because they are
203 /// slices into `string` under the hood, for efficiency.
204 ///
205 /// # Examples
206 /// ```
207 /// # use tre_regex::Result;
208 /// # fn main() -> Result<()> {
209 /// use tre_regex::{RegcompFlags, RegexecFlags, RegApproxParams, Regex};
210 ///
211 /// let regcomp_flags = RegcompFlags::new()
212 /// .add(RegcompFlags::EXTENDED)
213 /// .add(RegcompFlags::ICASE);
214 /// let regaexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
215 /// let regaexec_params = RegApproxParams::new()
216 /// .cost_ins(1)
217 /// .cost_del(1)
218 /// .cost_subst(1)
219 /// .max_cost(2)
220 /// .max_del(2)
221 /// .max_ins(2)
222 /// .max_subst(2)
223 /// .max_err(2);
224 ///
225 /// let compiled_reg = Regex::new("^(hello).*(world)$", regcomp_flags)?;
226 /// let result = compiled_reg.regaexec(
227 /// "hello world", // String to match against
228 /// ®aexec_params, // Matching parameters
229 /// 3, // Number of matches we want
230 /// regaexec_flags // Flags
231 /// )?;
232 ///
233 /// for (i, matched) in result.get_matches().into_iter().enumerate() {
234 /// match matched {
235 /// Some(substr) => println!("Match {i}: {}", substr.as_ref().unwrap()),
236 /// None => println!("Match {i}: <None>"),
237 /// }
238 /// }
239 /// # Ok(())
240 /// # }
241 /// ```
242 #[inline]
243 pub fn regaexec<'a>(
244 &self,
245 string: &'a str,
246 params: &RegApproxParams,
247 nmatches: usize,
248 flags: RegexecFlags,
249 ) -> Result<RegApproxMatchStr<'a>> {
250 let data = string.as_bytes();
251 let match_results = self.regaexec_bytes(data, params, nmatches, flags)?;
252
253 let mut result: Vec<Option<Result<Cow<'a, str>>>> = Vec::with_capacity(nmatches);
254 for pmatch in match_results.get_matches() {
255 let Some(pmatch) = pmatch else {
256 result.push(None);
257 continue;
258 };
259
260 #[allow(clippy::match_wildcard_for_single_variants)]
261 result.push(Some(match pmatch {
262 Cow::Borrowed(pmatch) => match std::str::from_utf8(pmatch) {
263 Ok(s) => Ok(s.into()),
264 Err(e) => Err(RegexError::new(
265 ErrorKind::Binding(BindingErrorCode::ENCODING),
266 &format!("UTF-8 encoding error: {e}"),
267 )),
268 },
269 // SAFETY: cannot get here, we only have borrowed values.
270 _ => unsafe { unreachable_unchecked() },
271 }));
272 }
273
274 Ok(RegApproxMatchStr::new(
275 string,
276 result,
277 *match_results.get_regamatch(),
278 ))
279 }
280
281 /// Performs an approximate regex search on the passed bytes, returning `nmatches` results.
282 ///
283 /// This function should only be used if you need to match raw bytes, or bytes which may not be
284 /// UTF-8 compliant. Otherwise, [`regaexec`] is recommended instead.
285 ///
286 /// # Arguments
287 /// * `data`: [`u8`] slice to match against `compiled_reg`
288 /// * `params`: see [`RegApproxParams`]
289 /// * `nmatches`: number of matches to return
290 /// * `flags`: [`RegexecFlags`] to pass to [`tre_reganexec`](tre_regex_sys::tre_reganexec).
291 ///
292 /// # Returns
293 /// If no error was found, a [`Vec`] of [`Option`]s will be returned.
294 ///
295 /// If a given match index is empty, The `Option` will be `None`. Otherwise, [`u8`] slices will be
296 /// returned.
297 ///
298 /// # Errors
299 /// If an error is encountered during matching, it returns a [`RegexError`].
300 ///
301 /// # Caveats
302 /// Unless copied, the match results must live at least as long as `data`. This is because they are
303 /// slices into `data` under the hood, for efficiency.
304 ///
305 /// # Examples
306 /// ```
307 /// # use tre_regex::Result;
308 /// # fn main() -> Result<()> {
309 /// use tre_regex::{RegcompFlags, RegexecFlags, RegApproxParams, Regex};
310 ///
311 /// let regcomp_flags = RegcompFlags::new()
312 /// .add(RegcompFlags::EXTENDED)
313 /// .add(RegcompFlags::ICASE);
314 /// let regaexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
315 /// let regaexec_params = RegApproxParams::new()
316 /// .cost_ins(1)
317 /// .cost_del(1)
318 /// .cost_subst(1)
319 /// .max_cost(2)
320 /// .max_del(2)
321 /// .max_ins(2)
322 /// .max_subst(2)
323 /// .max_err(2);
324 ///
325 /// let compiled_reg = Regex::new("^(hello).*(world)$", regcomp_flags)?;
326 /// let result = compiled_reg.regaexec_bytes(
327 /// b"hello world", // Bytes to match against
328 /// ®aexec_params, // Matching parameters
329 /// 3, // Number of matches we want
330 /// regaexec_flags // Flags
331 /// )?;
332 ///
333 /// for (i, matched) in result.get_matches().into_iter().enumerate() {
334 /// match matched {
335 /// Some(substr) => println!(
336 /// "Match {i}: {}",
337 /// std::str::from_utf8(substr).unwrap()
338 /// ),
339 /// None => println!("Match {i}: <None>"),
340 /// }
341 /// }
342 /// # Ok(())
343 /// # }
344 /// ```
345 pub fn regaexec_bytes<'a>(
346 &self,
347 data: &'a [u8],
348 params: &RegApproxParams,
349 nmatches: usize,
350 flags: RegexecFlags,
351 ) -> Result<RegApproxMatchBytes<'a>> {
352 let Some(compiled_reg_obj) = self.get() else {
353 return Err(RegexError::new(
354 ErrorKind::Binding(BindingErrorCode::REGEX_VACANT),
355 "Attempted to unwrap a vacant Regex object",
356 ));
357 };
358 let mut match_vec: Vec<tre::regmatch_t> =
359 vec![tre::regmatch_t { rm_so: 0, rm_eo: 0 }; nmatches];
360 let mut amatch = tre::regamatch_t {
361 nmatch: nmatches,
362 pmatch: match_vec.as_mut_ptr(),
363 ..Default::default()
364 };
365
366 // SAFETY: compiled_reg is a wrapped type (see safety concerns for Regex). data is read-only.
367 // match_vec has enough room for everything. flags also cannot wrap around.
368 #[allow(clippy::cast_possible_wrap)]
369 let result = unsafe {
370 tre::tre_reganexec(
371 compiled_reg_obj,
372 data.as_ptr().cast::<i8>(),
373 data.len(),
374 &mut amatch,
375 *params.get(),
376 flags.get(),
377 )
378 };
379 if result != 0 {
380 return Err(self.regerror(result));
381 }
382
383 let mut result: Vec<Option<Cow<'a, [u8]>>> = Vec::with_capacity(nmatches);
384 for pmatch in match_vec {
385 if pmatch.rm_so < 0 || pmatch.rm_eo < 0 {
386 result.push(None);
387 continue;
388 }
389
390 // Wraparound is impossible.
391 #[allow(clippy::cast_sign_loss)]
392 let start_offset = pmatch.rm_so as usize;
393 #[allow(clippy::cast_sign_loss)]
394 let end_offset = pmatch.rm_eo as usize;
395
396 result.push(Some(Cow::Borrowed(&data[start_offset..end_offset])));
397 }
398
399 Ok(RegApproxMatchBytes::new(data, result, amatch))
400 }
401}
402
403/// Performs an approximate regex search on the passed string, returning `nmatches` results.
404///
405/// This is a thin wrapper around [`Regex::regaexec`].
406///
407/// Non-matching subexpressions or patterns will return `None` in the results.
408///
409/// # Arguments
410/// * `compiled_reg`: the compiled [`Regex`] object.
411/// * `string`: string to match against `compiled_reg`
412/// * `params`: see [`RegApproxParams`]
413/// * `nmatches`: number of matches to return
414/// * `flags`: [`RegexecFlags`] to pass to [`tre_regnexec`](tre_regex_sys::tre_regnexec).
415///
416/// # Returns
417/// If no error was found, a [`Vec`] of [`Option`]s will be returned.
418///
419/// If a given match index is empty, The `Option` will be `None`. Otherwise, [`Result`]s will be
420/// returned, containing either errors or substrings of the matches. Errors may be returned due to
421/// decoding problems, such as split codepoints.
422///
423/// # Errors
424/// If an error is encountered during matching, it returns a [`RegexError`]. Match results may also
425/// return errors, if decoding into UTF-8 was unsuccessful for whatever reason.
426///
427/// # Caveats
428/// Unless copied, the match results must live at least as long as `string`. This is because they are
429/// slices into `string` under the hood, for efficiency.
430///
431/// # Examples
432/// ```
433/// # use tre_regex::Result;
434/// # fn main() -> Result<()> {
435/// use tre_regex::{RegcompFlags, RegexecFlags, RegApproxParams, Regex, regaexec};
436///
437/// let regcomp_flags = RegcompFlags::new()
438/// .add(RegcompFlags::EXTENDED)
439/// .add(RegcompFlags::ICASE);
440/// let regaexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
441/// let regaexec_params = RegApproxParams::new()
442/// .cost_ins(1)
443/// .cost_del(1)
444/// .cost_subst(1)
445/// .max_cost(2)
446/// .max_del(2)
447/// .max_ins(2)
448/// .max_subst(2)
449/// .max_err(2);
450///
451/// let compiled_reg = Regex::new("^(hello).*(world)$", regcomp_flags)?;
452/// let result = regaexec(
453/// &compiled_reg, // Compiled regex
454/// "hello world", // String to match against
455/// ®aexec_params, // Matching parameters
456/// 3, // Number of matches we want
457/// regaexec_flags // Flags
458/// )?;
459///
460/// for (i, matched) in result.get_matches().into_iter().enumerate() {
461/// match matched {
462/// Some(substr) => println!("Match {i}: {}", substr.as_ref().unwrap()),
463/// None => println!("Match {i}: <None>"),
464/// }
465/// }
466/// # Ok(())
467/// # }
468/// ```
469#[inline]
470pub fn regaexec<'a>(
471 compiled_reg: &Regex,
472 string: &'a str,
473 params: &RegApproxParams,
474 nmatches: usize,
475 flags: RegexecFlags,
476) -> Result<RegApproxMatchStr<'a>> {
477 compiled_reg.regaexec(string, params, nmatches, flags)
478}
479
480/// Performs an approximate regex search on the passed bytes, returning `nmatches` results.
481///
482/// This is a thin wrapper around [`Regex::regaexec_bytes`].
483///
484/// This function should only be used if you need to match raw bytes, or bytes which may not be
485/// UTF-8 compliant. Otherwise, [`regaexec`] is recommended instead.
486///
487/// # Arguments
488/// * `compiled_reg`: the compiled [`Regex`] object.
489/// * `data`: [`u8`] slice to match against `compiled_reg`
490/// * `params`: see [`RegApproxParams`]
491/// * `nmatches`: number of matches to return
492/// * `flags`: [`RegexecFlags`] to pass to [`tre_regnexec`](tre_regex_sys::tre_regnexec).
493///
494/// # Returns
495/// If no error was found, a [`Vec`] of [`Option`]s will be returned.
496///
497/// If a given match index is empty, The `Option` will be `None`. Otherwise, [`u8`] slices will be
498/// returned.
499///
500/// # Errors
501/// If an error is encountered during matching, it returns a [`RegexError`].
502///
503/// # Caveats
504/// Unless copied, the match results must live at least as long as `data`. This is because they are
505/// slices into `data` under the hood, for efficiency.
506///
507/// # Examples
508/// ```
509/// # use tre_regex::Result;
510/// # fn main() -> Result<()> {
511/// use tre_regex::{RegcompFlags, RegexecFlags, RegApproxParams, Regex, regaexec_bytes};
512///
513/// let regcomp_flags = RegcompFlags::new()
514/// .add(RegcompFlags::EXTENDED)
515/// .add(RegcompFlags::ICASE);
516/// let regaexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
517/// let regaexec_params = RegApproxParams::new()
518/// .cost_ins(1)
519/// .cost_del(1)
520/// .cost_subst(1)
521/// .max_cost(2)
522/// .max_del(2)
523/// .max_ins(2)
524/// .max_subst(2)
525/// .max_err(2);
526///
527/// let compiled_reg = Regex::new("^(hello).*(world)$", regcomp_flags)?;
528/// let result = regaexec_bytes(
529/// &compiled_reg, // Compiled regex
530/// b"hello world", // Bytes to match against
531/// ®aexec_params, // Matching parameters
532/// 3, // Number of matches we want
533/// regaexec_flags // Flags
534/// )?;
535///
536/// for (i, matched) in result.get_matches().into_iter().enumerate() {
537/// match matched {
538/// Some(substr) => println!(
539/// "Match {i}: {}",
540/// std::str::from_utf8(substr).unwrap()
541/// ),
542/// None => println!("Match {i}: <None>"),
543/// }
544/// }
545/// # Ok(())
546/// # }
547/// ```
548#[inline]
549pub fn regaexec_bytes<'a>(
550 compiled_reg: &Regex,
551 data: &'a [u8],
552 params: &RegApproxParams,
553 nmatches: usize,
554 flags: RegexecFlags,
555) -> Result<RegApproxMatchBytes<'a>> {
556 compiled_reg.regaexec_bytes(data, params, nmatches, flags)
557}