tre_regex/exec.rs
1use std::borrow::Cow;
2use std::hint::unreachable_unchecked;
3
4use crate::{err::{BindingErrorCode, ErrorKind, RegexError, Result}, flags::RegexecFlags, tre, Regex};
5
/// Match results produced by [`Regex::regexec`] and [`regexec`].
///
/// There is one entry per returned match slot. An entry is `None` when the slot did not match;
/// otherwise it holds `Ok` with the matched substring, or `Err` when the matched bytes could not
/// be decoded as UTF-8.
pub type RegMatchStr<'a> = Vec<Option<Result<Cow<'a, str>>>>;
/// Match results produced by [`Regex::regexec_bytes`] and [`regexec_bytes`].
///
/// There is one entry per returned match slot. An entry is `None` when the slot did not match;
/// otherwise it holds the matched bytes.
pub type RegMatchBytes<'a> = Vec<Option<Cow<'a, [u8]>>>;
8
9impl Regex {
10 /// Performs a regex search on the passed string, returning `nmatches` results.
11 ///
12 /// Non-matching subexpressions or patterns will return `None` in the results.
13 ///
14 /// # Arguments
15 /// * `string`: string to match against `compiled_reg`
16 /// * `nmatches`: number of matches to return
17 /// * `flags`: [`RegexecFlags`] to pass to [`tre_regnexec`](tre_regex_sys::tre_regnexec).
18 ///
19 /// # Returns
20 /// If no error was found, a [`Vec`] of [`Option`]s will be returned.
21 ///
22 /// If a given match index is empty, The `Option` will be `None`. Otherwise, [`Result`]s will be
23 /// returned, containing either errors or substrings of the matches. Errors may be returned due to
24 /// decoding problems, such as split codepoints.
25 ///
26 /// # Errors
27 /// If an error is encountered during matching, it returns a [`RegexError`]. Match results may also
28 /// return errors, if decoding into UTF-8 was unsuccessful for whatever reason.
29 ///
30 /// # Caveats
31 /// Unless copied, the match results must live at least as long as `string`. This is because they are
32 /// slices into `string` under the hood, for efficiency.
33 ///
34 /// # Examples
35 /// ```
36 /// # use tre_regex::Result;
37 /// # fn main() -> Result<()> {
38 /// use tre_regex::{RegcompFlags, RegexecFlags, Regex};
39 ///
40 /// let regcomp_flags = RegcompFlags::new()
41 /// .add(RegcompFlags::EXTENDED)
42 /// .add(RegcompFlags::ICASE)
43 /// .add(RegcompFlags::UNGREEDY);
44 /// let regexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
45 ///
46 /// let compiled_reg = Regex::new("^(hello).*(world)$", regcomp_flags)?;
47 /// let matches = compiled_reg.regexec("hello world", 3, regexec_flags)?;
48 ///
49 /// for (i, matched) in matches.into_iter().enumerate() {
50 /// match matched {
51 /// Some(res) => {
52 /// match res {
53 /// Ok(substr) => println!("Match {i}: '{}'", substr),
54 /// Err(e) => println!("Match {i}: <Error: {e}>"),
55 /// }
56 /// },
57 /// None => println!("Match {i}: <None>"),
58 /// }
59 /// }
60 /// # Ok(())
61 /// # }
62 /// ```
63 ///
64 /// [`RegexError`]: crate::RegexError
65 #[inline]
66 pub fn regexec<'a>(
67 &self,
68 string: &'a str,
69 nmatches: usize,
70 flags: RegexecFlags,
71 ) -> Result<RegMatchStr<'a>> {
72 let data = string.as_bytes();
73 let match_results = self.regexec_bytes(data, nmatches, flags)?;
74
75 let mut result: Vec<Option<Result<Cow<'a, str>>>> = Vec::with_capacity(nmatches);
76 for pmatch in match_results {
77 let Some(pmatch) = pmatch else {
78 result.push(None);
79 continue;
80 };
81
82 #[allow(clippy::match_wildcard_for_single_variants)]
83 result.push(Some(match pmatch {
84 Cow::Borrowed(pmatch) => match std::str::from_utf8(pmatch) {
85 Ok(s) => Ok(s.into()),
86 Err(e) => Err(RegexError::new(
87 ErrorKind::Binding(BindingErrorCode::ENCODING),
88 &format!("UTF-8 encoding error: {e}"),
89 )),
90 },
91 // SAFETY: cannot get here, we only have borrowed values.
92 _ => unsafe { unreachable_unchecked() },
93 }));
94 }
95
96 Ok(result)
97 }
98
99 /// Performs a regex search on the passed bytes, returning `nmatches` results.
100 ///
101 /// This function should only be used if you need to match raw bytes, or bytes which may not be
102 /// UTF-8 compliant. Otherwise, [`regexec`] is recommended instead.
103 ///
104 /// # Arguments
105 /// * `data`: [`u8`] slice to match against `compiled_reg`
106 /// * `nmatches`: number of matches to return
107 /// * `flags`: [`RegexecFlags`] to pass to [`tre_regnexec`](tre_regex_sys::tre_regnexec).
108 ///
109 /// # Returns
110 /// If no error was found, a [`Vec`] of [`Option`]s will be returned.
111 ///
112 /// If a given match index is empty, The `Option` will be `None`. Otherwise, [`u8`] slices will be
113 /// returned.
114 ///
115 /// # Errors
116 /// If an error is encountered during matching, it returns a [`RegexError`].
117 ///
118 /// # Caveats
119 /// Unless copied, the match results must live at least as long as `data`. This is because they are
120 /// slices into `data` under the hood, for efficiency.
121 ///
122 /// # Examples
123 /// ```
124 /// # use tre_regex::Result;
125 /// # fn main() -> Result<()> {
126 /// use tre_regex::{RegcompFlags, RegexecFlags, Regex};
127 ///
128 /// let regcomp_flags = RegcompFlags::new()
129 /// .add(RegcompFlags::EXTENDED)
130 /// .add(RegcompFlags::ICASE);
131 /// let regexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
132 ///
133 /// let compiled_reg = Regex::new("^(hello).*(world)$", regcomp_flags)?;
134 /// let matches = compiled_reg.regexec_bytes(b"hello world", 2, regexec_flags)?;
135 ///
136 /// for (i, matched) in matches.into_iter().enumerate() {
137 /// match matched {
138 /// Some(substr) => println!(
139 /// "Match {i}: {}",
140 /// std::str::from_utf8(substr.as_ref()).unwrap()
141 /// ),
142 /// None => println!("Match {i}: <None>"),
143 /// }
144 /// }
145 /// # Ok(())
146 /// # }
147 /// ```
148 pub fn regexec_bytes<'a>(
149 &self,
150 data: &'a [u8],
151 nmatches: usize,
152 flags: RegexecFlags,
153 ) -> Result<RegMatchBytes<'a>> {
154 let Some(compiled_reg_obj) = self.get() else {
155 return Err(RegexError::new(
156 ErrorKind::Binding(BindingErrorCode::REGEX_VACANT),
157 "Attempted to unwrap a vacant Regex object",
158 ));
159 };
160 let mut match_vec: Vec<tre::regmatch_t> =
161 vec![tre::regmatch_t { rm_so: 0, rm_eo: 0 }; nmatches];
162
163 // SAFETY: compiled_reg is a wrapped type (see safety concerns for Regex). data is read-only.
164 // match_vec has enough room for everything. flags also cannot wrap around.
165 #[allow(clippy::cast_possible_wrap)]
166 let result = unsafe {
167 tre::tre_regnexec(
168 compiled_reg_obj,
169 data.as_ptr().cast::<i8>(),
170 data.len(),
171 nmatches,
172 match_vec.as_mut_ptr(),
173 flags.get(),
174 )
175 };
176 if result != 0 {
177 return Err(self.regerror(result));
178 }
179
180 let mut result: Vec<Option<Cow<'a, [u8]>>> = Vec::with_capacity(nmatches);
181 for pmatch in match_vec {
182 if pmatch.rm_so < 0 || pmatch.rm_eo < 0 {
183 result.push(None);
184 continue;
185 }
186
187 // Wraparound is impossible.
188 #[allow(clippy::cast_sign_loss)]
189 let start_offset = pmatch.rm_so as usize;
190 #[allow(clippy::cast_sign_loss)]
191 let end_offset = pmatch.rm_eo as usize;
192
193 result.push(Some(Cow::Borrowed(&data[start_offset..end_offset])));
194 }
195
196 Ok(result)
197 }
198}
199
200/// Performs a regex search on the passed string, returning `nmatches` results.
201///
202/// This is a thin wrapper around [`Regex::regexec`].
203///
204/// Non-matching subexpressions or patterns will return `None` in the results.
205///
206/// # Arguments
207/// * `compiled_reg`: the compiled [`Regex`] object.
208/// * `string`: string to match against `compiled_reg`
209/// * `nmatches`: number of matches to return
210/// * `flags`: [`RegexecFlags`] to pass to [`tre_regnexec`](tre_regex_sys::tre_regnexec).
211///
212/// # Returns
213/// If no error was found, a [`Vec`] of [`Option`]s will be returned.
214///
215/// If a given match index is empty, The `Option` will be `None`. Otherwise, [`Result`]s will be
216/// returned, containing either errors or substrings of the matches. Errors may be returned due to
217/// decoding problems, such as split codepoints.
218///
219/// # Errors
220/// If an error is encountered during matching, it returns a [`RegexError`]. Match results may also
221/// return errors, if decoding into UTF-8 was unsuccessful for whatever reason.
222///
223/// # Caveats
224/// Unless copied, the match results must live at least as long as `string`. This is because they are
225/// slices into `string` under the hood, for efficiency.
226///
227/// # Examples
228/// ```
229/// # use tre_regex::Result;
230/// # fn main() -> Result<()> {
231/// use tre_regex::{RegcompFlags, RegexecFlags, regcomp, regexec};
232///
233/// let regcomp_flags = RegcompFlags::new()
234/// .add(RegcompFlags::EXTENDED)
235/// .add(RegcompFlags::ICASE)
236/// .add(RegcompFlags::UNGREEDY);
237/// let regexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
238///
239/// let compiled_reg = regcomp("^(hello).*(world)$", regcomp_flags)?;
240/// let matches = regexec(
241/// &compiled_reg, // Compiled regex
242/// "hello world", // String to match against
243/// 2, // Number of matches
244/// regexec_flags // Flags
245/// )?;
246///
247/// for (i, matched) in matches.into_iter().enumerate() {
248/// match matched {
249/// Some(substr) => {
250/// match substr {
251/// Ok(substr) => println!("Match {i}: '{}'", substr),
252/// Err(e) => println!("Match {i}: <Error: {e}>"),
253/// }
254/// },
255/// None => println!("Match {i}: <None>"),
256/// }
257/// }
258/// # Ok(())
259/// # }
260/// ```
261#[inline]
262pub fn regexec<'a>(
263 compiled_reg: &Regex,
264 string: &'a str,
265 nmatches: usize,
266 flags: RegexecFlags,
267) -> Result<RegMatchStr<'a>> {
268 compiled_reg.regexec(string, nmatches, flags)
269}
270
271/// Performs a regex search on the passed bytes, returning `nmatches` results.
272///
273/// This is a thin wrapper around [`Regex::regexec_bytes`].
274///
275/// This function should only be used if you need to match raw bytes, or bytes which may not be
276/// UTF-8 compliant. Otherwise, [`regexec`] is recommended instead.
277///
278/// # Arguments
279/// * `compiled_reg`: the compiled [`Regex`] object.
280/// * `data`: [`u8`] slice to match against `compiled_reg`
281/// * `nmatches`: number of matches to return
282/// * `flags`: [`RegexecFlags`] to pass to [`tre_regnexec`](tre_regex_sys::tre_regnexec).
283///
284/// # Returns
285/// If no error was found, a [`Vec`] of [`Option`]s will be returned.
286///
287/// If a given match index is empty, The `Option` will be `None`. Otherwise, [`u8`] slices will be
288/// returned.
289///
290/// # Errors
291/// If an error is encountered during matching, it returns a [`RegexError`].
292///
293/// # Caveats
294/// Unless copied, the match results must live at least as long as `data`. This is because they are
295/// slices into `data` under the hood, for efficiency.
296///
297/// # Examples
298/// ```
299/// # use tre_regex::Result;
300/// # fn main() -> Result<()> {
301/// use tre_regex::{RegcompFlags, RegexecFlags, regcomp, regexec_bytes};
302///
303/// let regcomp_flags = RegcompFlags::new()
304/// .add(RegcompFlags::EXTENDED)
305/// .add(RegcompFlags::ICASE)
306/// .add(RegcompFlags::UNGREEDY);
307/// let regexec_flags = RegexecFlags::new().add(RegexecFlags::NONE);
308///
309/// let compiled_reg = regcomp("^(hello).*(world)$", regcomp_flags)?;
310/// let matches = regexec_bytes(
311/// &compiled_reg, // Compiled regex
312/// b"hello world", // Bytes to match against
313/// 2, // Number of matches
314/// regexec_flags // Flags
315/// )?;
316///
317/// for (i, matched) in matches.into_iter().enumerate() {
318/// match matched {
319/// Some(substr) => println!(
320/// "Match {i}: {}",
321/// std::str::from_utf8(substr.as_ref()).unwrap()
322/// ),
323/// None => println!("Match {i}: <None>"),
324/// }
325/// }
326/// # Ok(())
327/// # }
328/// ```
329pub fn regexec_bytes<'a>(
330 compiled_reg: &Regex,
331 data: &'a [u8],
332 nmatches: usize,
333 flags: RegexecFlags,
334) -> Result<RegMatchBytes<'a>> {
335 compiled_reg.regexec_bytes(data, nmatches, flags)
336}