tre_regex/
exec.rs

1use std::borrow::Cow;
2use std::hint::unreachable_unchecked;
3
4use crate::{err::{BindingErrorCode, ErrorKind, RegexError, Result}, flags::RegexecFlags, tre, Regex};
5
/// Match list produced by the string-based `regexec` functions.
///
/// Each element corresponds to one requested match slot: `None` when the slot did not match,
/// otherwise `Some` holding either the matched text or the error raised while decoding it.
pub type RegMatchStr<'a> = Vec<Option<Result<Cow<'a, str>>>>;
/// Match list produced by the byte-based `regexec_bytes` functions.
///
/// Each element corresponds to one requested match slot: `None` when the slot did not match,
/// otherwise `Some` holding the matched bytes.
pub type RegMatchBytes<'a> = Vec<Option<Cow<'a, [u8]>>>;
8
9impl Regex {
10    /// Performs a regex search on the passed string, returning `nmatches` results.
11    ///
12    /// Non-matching subexpressions or patterns will return `None` in the results.
13    ///
14    /// # Arguments
15    /// * `string`: string to match against `compiled_reg`
16    /// * `nmatches`: number of matches to return
17    /// * `flags`: [`RegexecFlags`] to pass to [`tre_regnexec`](tre_regex_sys::tre_regnexec).
18    ///
19    /// # Returns
20    /// If no error was found, a [`Vec`] of [`Option`]s will be returned.
21    ///
22    /// If a given match index is empty, The `Option` will be `None`. Otherwise, [`Result`]s will be
23    /// returned, containing either errors or substrings of the matches. Errors may be returned due to
24    /// decoding problems, such as split codepoints.
25    ///
26    /// # Errors
27    /// If an error is encountered during matching, it returns a [`RegexError`]. Match results may also
28    /// return errors, if decoding into UTF-8 was unsuccessful for whatever reason.
29    ///
30    /// # Caveats
31    /// Unless copied, the match results must live at least as long as `string`. This is because they are
32    /// slices into `string` under the hood, for efficiency.
33    ///
34    /// # Examples
35    /// ```
36    /// # use tre_regex::Result;
37    /// # fn main() -> Result<()> {
38    /// use tre_regex::{RegcompFlags, RegexecFlags, Regex};
39    ///
40    /// let regcomp_flags = RegcompFlags::new()
41    ///     .add(RegcompFlags::EXTENDED)
42    ///     .add(RegcompFlags::ICASE)
43    ///     .add(RegcompFlags::UNGREEDY);
44    /// let regexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
45    ///
46    /// let compiled_reg = Regex::new("^(hello).*(world)$", regcomp_flags)?;
47    /// let matches = compiled_reg.regexec("hello world", 3, regexec_flags)?;
48    ///
49    /// for (i, matched) in matches.into_iter().enumerate() {
50    ///     match matched {
51    ///         Some(res) => {
52    ///             match res {
53    ///                 Ok(substr) => println!("Match {i}: '{}'", substr),
54    ///                 Err(e) => println!("Match {i}: <Error: {e}>"),
55    ///             }
56    ///         },
57    ///         None => println!("Match {i}: <None>"),
58    ///     }
59    /// }
60    /// # Ok(())
61    /// # }
62    /// ```
63    ///
64    /// [`RegexError`]: crate::RegexError
65    #[inline]
66    pub fn regexec<'a>(
67        &self,
68        string: &'a str,
69        nmatches: usize,
70        flags: RegexecFlags,
71    ) -> Result<RegMatchStr<'a>> {
72        let data = string.as_bytes();
73        let match_results = self.regexec_bytes(data, nmatches, flags)?;
74
75        let mut result: Vec<Option<Result<Cow<'a, str>>>> = Vec::with_capacity(nmatches);
76        for pmatch in match_results {
77            let Some(pmatch) = pmatch else {
78                result.push(None);
79                continue;
80            };
81
82            #[allow(clippy::match_wildcard_for_single_variants)]
83            result.push(Some(match pmatch {
84                Cow::Borrowed(pmatch) => match std::str::from_utf8(pmatch) {
85                    Ok(s) => Ok(s.into()),
86                    Err(e) => Err(RegexError::new(
87                        ErrorKind::Binding(BindingErrorCode::ENCODING),
88                        &format!("UTF-8 encoding error: {e}"),
89                    )),
90                },
91                // SAFETY: cannot get here, we only have borrowed values.
92                _ => unsafe { unreachable_unchecked() },
93            }));
94        }
95
96        Ok(result)
97    }
98
99    /// Performs a regex search on the passed bytes, returning `nmatches` results.
100    ///
101    /// This function should only be used if you need to match raw bytes, or bytes which may not be
102    /// UTF-8 compliant. Otherwise, [`regexec`] is recommended instead.
103    ///
104    /// # Arguments
105    /// * `data`: [`u8`] slice to match against `compiled_reg`
106    /// * `nmatches`: number of matches to return
107    /// * `flags`: [`RegexecFlags`] to pass to [`tre_regnexec`](tre_regex_sys::tre_regnexec).
108    ///
109    /// # Returns
110    /// If no error was found, a [`Vec`] of [`Option`]s will be returned.
111    ///
112    /// If a given match index is empty, The `Option` will be `None`. Otherwise, [`u8`] slices will be
113    /// returned.
114    ///
115    /// # Errors
116    /// If an error is encountered during matching, it returns a [`RegexError`].
117    ///
118    /// # Caveats
119    /// Unless copied, the match results must live at least as long as `data`. This is because they are
120    /// slices into `data` under the hood, for efficiency.
121    ///
122    /// # Examples
123    /// ```
124    /// # use tre_regex::Result;
125    /// # fn main() -> Result<()> {
126    /// use tre_regex::{RegcompFlags, RegexecFlags, Regex};
127    ///
128    /// let regcomp_flags = RegcompFlags::new()
129    ///     .add(RegcompFlags::EXTENDED)
130    ///     .add(RegcompFlags::ICASE);
131    /// let regexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
132    ///
133    /// let compiled_reg = Regex::new("^(hello).*(world)$", regcomp_flags)?;
134    /// let matches = compiled_reg.regexec_bytes(b"hello world", 2, regexec_flags)?;
135    ///
136    /// for (i, matched) in matches.into_iter().enumerate() {
137    ///     match matched {
138    ///         Some(substr) => println!(
139    ///             "Match {i}: {}",
140    ///             std::str::from_utf8(substr.as_ref()).unwrap()
141    ///         ),
142    ///         None => println!("Match {i}: <None>"),
143    ///     }
144    /// }
145    /// # Ok(())
146    /// # }
147    /// ```
148    pub fn regexec_bytes<'a>(
149        &self,
150        data: &'a [u8],
151        nmatches: usize,
152        flags: RegexecFlags,
153    ) -> Result<RegMatchBytes<'a>> {
154        let Some(compiled_reg_obj) = self.get() else {
155            return Err(RegexError::new(
156                ErrorKind::Binding(BindingErrorCode::REGEX_VACANT),
157                "Attempted to unwrap a vacant Regex object",
158            ));
159        };
160        let mut match_vec: Vec<tre::regmatch_t> =
161            vec![tre::regmatch_t { rm_so: 0, rm_eo: 0 }; nmatches];
162
163        // SAFETY: compiled_reg is a wrapped type (see safety concerns for Regex). data is read-only.
164        // match_vec has enough room for everything. flags also cannot wrap around.
165        #[allow(clippy::cast_possible_wrap)]
166        let result = unsafe {
167            tre::tre_regnexec(
168                compiled_reg_obj,
169                data.as_ptr().cast::<i8>(),
170                data.len(),
171                nmatches,
172                match_vec.as_mut_ptr(),
173                flags.get(),
174            )
175        };
176        if result != 0 {
177            return Err(self.regerror(result));
178        }
179
180        let mut result: Vec<Option<Cow<'a, [u8]>>> = Vec::with_capacity(nmatches);
181        for pmatch in match_vec {
182            if pmatch.rm_so < 0 || pmatch.rm_eo < 0 {
183                result.push(None);
184                continue;
185            }
186
187            // Wraparound is impossible.
188            #[allow(clippy::cast_sign_loss)]
189            let start_offset = pmatch.rm_so as usize;
190            #[allow(clippy::cast_sign_loss)]
191            let end_offset = pmatch.rm_eo as usize;
192
193            result.push(Some(Cow::Borrowed(&data[start_offset..end_offset])));
194        }
195
196        Ok(result)
197    }
198}
199
200/// Performs a regex search on the passed string, returning `nmatches` results.
201///
202/// This is a thin wrapper around [`Regex::regexec`].
203///
204/// Non-matching subexpressions or patterns will return `None` in the results.
205///
206/// # Arguments
207/// * `compiled_reg`: the compiled [`Regex`] object.
208/// * `string`: string to match against `compiled_reg`
209/// * `nmatches`: number of matches to return
210/// * `flags`: [`RegexecFlags`] to pass to [`tre_regnexec`](tre_regex_sys::tre_regnexec).
211///
212/// # Returns
213/// If no error was found, a [`Vec`] of [`Option`]s will be returned.
214///
215/// If a given match index is empty, The `Option` will be `None`. Otherwise, [`Result`]s will be
216/// returned, containing either errors or substrings of the matches. Errors may be returned due to
217/// decoding problems, such as split codepoints.
218///
219/// # Errors
220/// If an error is encountered during matching, it returns a [`RegexError`]. Match results may also
221/// return errors, if decoding into UTF-8 was unsuccessful for whatever reason.
222///
223/// # Caveats
224/// Unless copied, the match results must live at least as long as `string`. This is because they are
225/// slices into `string` under the hood, for efficiency.
226///
227/// # Examples
228/// ```
229/// # use tre_regex::Result;
230/// # fn main() -> Result<()> {
231/// use tre_regex::{RegcompFlags, RegexecFlags, regcomp, regexec};
232///
233/// let regcomp_flags = RegcompFlags::new()
234///     .add(RegcompFlags::EXTENDED)
235///     .add(RegcompFlags::ICASE)
236///     .add(RegcompFlags::UNGREEDY);
237/// let regexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
238///
239/// let compiled_reg = regcomp("^(hello).*(world)$", regcomp_flags)?;
240/// let matches = regexec(
241///     &compiled_reg,  // Compiled regex
242///     "hello world",  // String to match against
243///     2,              // Number of matches
244///     regexec_flags   // Flags
245/// )?;
246///
247/// for (i, matched) in matches.into_iter().enumerate() {
248///     match matched {
249///         Some(substr) => {
250///             match substr {
251///                 Ok(substr) => println!("Match {i}: '{}'", substr),
252///                 Err(e) => println!("Match {i}: <Error: {e}>"),
253///             }
254///         },
255///         None => println!("Match {i}: <None>"),
256///     }
257/// }
258/// # Ok(())
259/// # }
260/// ```
261#[inline]
262pub fn regexec<'a>(
263    compiled_reg: &Regex,
264    string: &'a str,
265    nmatches: usize,
266    flags: RegexecFlags,
267) -> Result<RegMatchStr<'a>> {
268    compiled_reg.regexec(string, nmatches, flags)
269}
270
271/// Performs a regex search on the passed bytes, returning `nmatches` results.
272///
273/// This is a thin wrapper around [`Regex::regexec_bytes`].
274///
275/// This function should only be used if you need to match raw bytes, or bytes which may not be
276/// UTF-8 compliant. Otherwise, [`regexec`] is recommended instead.
277///
278/// # Arguments
279/// * `compiled_reg`: the compiled [`Regex`] object.
280/// * `data`: [`u8`] slice to match against `compiled_reg`
281/// * `nmatches`: number of matches to return
282/// * `flags`: [`RegexecFlags`] to pass to [`tre_regnexec`](tre_regex_sys::tre_regnexec).
283///
284/// # Returns
285/// If no error was found, a [`Vec`] of [`Option`]s will be returned.
286///
287/// If a given match index is empty, The `Option` will be `None`. Otherwise, [`u8`] slices will be
288/// returned.
289///
290/// # Errors
291/// If an error is encountered during matching, it returns a [`RegexError`].
292///
293/// # Caveats
294/// Unless copied, the match results must live at least as long as `data`. This is because they are
295/// slices into `data` under the hood, for efficiency.
296///
297/// # Examples
298/// ```
299/// # use tre_regex::Result;
300/// # fn main() -> Result<()> {
301/// use tre_regex::{RegcompFlags, RegexecFlags, regcomp, regexec_bytes};
302///
303/// let regcomp_flags = RegcompFlags::new()
304///     .add(RegcompFlags::EXTENDED)
305///     .add(RegcompFlags::ICASE)
306///     .add(RegcompFlags::UNGREEDY);
307/// let regexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
308///
309/// let compiled_reg = regcomp("^(hello).*(world)$", regcomp_flags)?;
310/// let matches = regexec_bytes(
311///     &compiled_reg,  // Compiled regex
312///     b"hello world", // Bytes to match against
313///     2,              // Number of matches
314///     regexec_flags   // Flags
315/// )?;
316///
317/// for (i, matched) in matches.into_iter().enumerate() {
318///     match matched {
319///         Some(substr) => println!(
320///             "Match {i}: {}",
321///             std::str::from_utf8(substr.as_ref()).unwrap()
322///         ),
323///         None => println!("Match {i}: <None>"),
324///     }
325/// }
326/// # Ok(())
327/// # }
328/// ```
329pub fn regexec_bytes<'a>(
330    compiled_reg: &Regex,
331    data: &'a [u8],
332    nmatches: usize,
333    flags: RegexecFlags,
334) -> Result<RegMatchBytes<'a>> {
335    compiled_reg.regexec_bytes(data, nmatches, flags)
336}