regex_rs/
lib.rs

1//! Safe wrapper for [POSIX regular expressions API][regex-h] (provided by libc on POSIX-compliant OSes).
2//!
3//! [regex-h]: https://pubs.opengroup.org/onlinepubs/9699919799.2008edition/basedefs/regex.h.html#tag_13_37
4//!
5//! ```
6//! use regex_rs::*;
7//!
8//! let pattern = "This( often)? repeats time and again(, and again)*\\.";
9//! let compilation_flags = CompFlags::EXTENDED;
10//! let regex = Regex::new(pattern, compilation_flags)
11//!     .expect("Failed to compile pattern as POSIX extended regular expression");
12//!
13//! let input = "This repeats time and again, and again, and again.";
14//! // We're only interested in the first match, i.e. the part of text
15//! // that's matched by the whole regex
16//! let max_matches = 1;
17//! let match_flags = MatchFlags::empty();
18//! let matches = regex
19//!     .matches(input, max_matches, match_flags)
20//!     .expect("Error matching input against regex");
21//!
22//! // Found a match
23//! assert_eq!(matches.len(), 1);
24//!
25//! // Match spans from the beginning to the end of the input
26//! assert_eq!(matches[0].start_pos, 0);
27//! // `end_pos` holds one-past-the-end index
28//! assert_eq!(matches[0].end_pos, input.len());
29//! ```
30
31// This lint is nitpicky, I don't think it's really important how the literals are written.
32#![allow(clippy::unreadable_literal)]
33
34use bitflags::bitflags;
35use gettextrs::gettext;
36use libc::{regcomp, regerror, regex_t, regexec, regfree, regmatch_t};
37use std::ffi::{CString, OsString};
38use std::mem;
39use std::os::unix::ffi::OsStringExt;
40use std::ptr;
41use strprintf::fmt;
42
/// POSIX regular expression.
///
/// Owns a compiled `regex_t`. The compiled state is released with `regfree()` when the value
/// is dropped.
pub struct Regex {
    /// Compiled POSIX regular expression, as filled in by `regcomp()`.
    regex: regex_t,
}
48
bitflags! {
    /// Compilation flags.
    ///
    /// These affect what features are available inside the regex, and also how it's matched
    /// against the input string.
    pub struct CompFlags: i32 {
        /// Use Extended Regular Expressions.
        ///
        /// POSIX calls this `REG_EXTENDED`.
        const EXTENDED = libc::REG_EXTENDED;

        /// Ignore case when matching.
        ///
        /// POSIX calls this `REG_ICASE`.
        const IGNORE_CASE = libc::REG_ICASE;

        /// Report only success or failure of matching, without the positions of what matched.
        ///
        /// POSIX calls this `REG_NOSUB`.
        const NO_SUB = libc::REG_NOSUB;

        /// Give special meaning to newline characters.
        ///
        /// POSIX calls this `REG_NEWLINE`.
        ///
        /// Without this flag, newlines match themselves.
        ///
        /// With this flag, newlines match themselves except:
        ///
        /// 1. newline is not matched by `.` outside of bracket expressions or by any form of
        ///    non-matching lists;
        ///
        /// 2. beginning-of-line (`^`) matches zero-width string right after newline, regardless of
        ///    `MatchFlags::NOTBOL`;
        ///
        /// 3. end-of-line (`$`) matches zero-width string right before a newline, regardless of
        ///    `MatchFlags::NOTEOL`.
        const NEWLINE = libc::REG_NEWLINE;
    }
}
89
bitflags! {
    /// Matching flags.
    ///
    /// These affect how regex is matched against the input string.
    pub struct MatchFlags: i32 {
        /// The circumflex character (`^`), when taken as a special character, does not match the
        /// beginning of string.
        ///
        /// POSIX calls this `REG_NOTBOL`.
        const NOTBOL = libc::REG_NOTBOL;

        /// The dollar-sign (`$`), when taken as a special character, does not match the end of
        /// string.
        ///
        /// POSIX calls this `REG_NOTEOL`.
        const NOTEOL = libc::REG_NOTEOL;
    }
}
104
/// Start and end positions of a matched substring.
///
/// This is plain data, so it can be freely copied, compared and printed for debugging.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Match {
    /// Start position (counting from zero).
    pub start_pos: usize,

    /// One-past-end position (counting from zero).
    pub end_pos: usize,
}
113
114/// A wrapper around `libc::regerror()`.
115unsafe fn regex_error_to_str(errcode: libc::c_int, regex: &regex_t) -> Option<String> {
116    // Find out the size of the buffer needed to hold the error message
117    let errmsg_length = regerror(errcode, regex, ptr::null_mut(), 0);
118
119    // Allocate the buffer and get the message.
120    let mut errmsg: Vec<u8> = vec![0; errmsg_length];
121    // Casting `*mut u8` to `*mut c_char` should be safe since C doesn't really care:
122    // it can store any ASCII symbol in a `char`, disregarding signedness.
123    regerror(
124        errcode,
125        regex,
126        errmsg.as_mut_ptr() as *mut std::os::raw::c_char,
127        errmsg_length,
128    );
129
130    // Drop the trailing NUL byte that C uses to terminate strings
131    errmsg.pop();
132
133    OsString::from_vec(errmsg).into_string().ok()
134}
135
136impl Regex {
137    /// Compiles pattern as a regular expression.
138    ///
139    /// By default, pattern is assumed to be a basic regular expression. To interpret it as an
140    /// extended regular expression, add `CompFlags::EXTENDED` to the `flags`. See also other
141    /// `CompFlags` values to control some other aspects of the regex.
142    ///
143    /// # Returns
144    ///
145    /// Compiled regex or an error message.
146    pub fn new(pattern: &str, flags: CompFlags) -> Result<Regex, String> {
147        let pattern = CString::new(pattern)
148            .map_err(|_| String::from("Regular expression contains NUL byte"))?;
149
150        unsafe {
151            let mut regex: regex_t = mem::zeroed();
152            let errcode = regcomp(&mut regex, pattern.into_raw(), flags.bits());
153
154            if errcode == 0 {
155                Ok(Regex { regex })
156            } else {
157                match regex_error_to_str(errcode, &regex) {
158                    Some(regcomp_errmsg) => {
159                        let msg = fmt!(&gettext("regcomp returned code %i"), errcode);
160                        let msg = format!("{msg}: {regcomp_errmsg}");
161                        Err(msg)
162                    }
163
164                    None => Err(fmt!(&gettext("regcomp returned code %i"), errcode)),
165                }
166            }
167        }
168    }
169
170    /// Matches input string against regex, looking for up to `max_matches` matches.
171    ///
172    /// Regexes can contain parenthesized subexpressions. This method will return up to
173    /// `max_matches`-1 of those. First match is reserved for the text that the whole regex
174    /// matched.
175    ///
176    /// `flags` dictate how matching is performed. See `MatchFlags` for details.
177    ///
178    /// # Returns
179    ///
180    /// - `Ok` with an empty vector if no match found, or if `max_matches` is 0 and there were no
181    ///   errors.
182    /// - `Ok` with a non-empty vector if `max_matches` was non-zero and a match was found. First
183    ///   element of the vector is the text that regex as a whole matched. The rest of the elements
184    ///   are pieces of text that were matched by parenthesized subexpressions.
185    /// - `Err` with an error message.
186    pub fn matches(
187        &self,
188        input: &str,
189        max_matches: usize,
190        flags: MatchFlags,
191    ) -> Result<Vec<Match>, String> {
192        let input =
193            CString::new(input).map_err(|_| String::from("Input string contains NUL byte"))?;
194
195        let mut pmatch: Vec<regmatch_t>;
196
197        let errcode = unsafe {
198            pmatch = vec![mem::zeroed(); max_matches];
199
200            regexec(
201                &self.regex,
202                input.into_raw(),
203                max_matches as libc::size_t,
204                pmatch.as_mut_ptr(),
205                flags.bits(),
206            )
207        };
208
209        match errcode {
210            0 => {
211                // Success. Let's copy results
212                let mut matches: Vec<Match> = Vec::new();
213
214                for m in pmatch {
215                    if m.rm_so < 0 || m.rm_eo < 0 {
216                        // Since `max_matches` can be bigger than the number of parenthesized
217                        // blocks in the regex, it's possible that some of the `pmatch` values are
218                        // empty. We exit the loop after detecting first such value.
219                        break;
220                    }
221
222                    // It's safe to cast i32 to usize here:
223                    // - we already checked that the values aren't negative
224                    // - usize's upper bound is higher than i32's
225                    matches.push(Match {
226                        start_pos: m.rm_so as usize,
227                        end_pos: m.rm_eo as usize,
228                    });
229                }
230
231                Ok(matches)
232            }
233
234            libc::REG_NOMATCH => {
235                // Matching went okay, but nothing found
236                Ok(Vec::new())
237            }
238
239            // POSIX only specifies two return codes for regexec(), but implementations are free to
240            // extend that.
241            _ => unsafe {
242                match regex_error_to_str(errcode, &self.regex) {
243                    Some(regexec_errmsg) => {
244                        let msg = fmt!(&gettext("regexec returned code %i"), errcode);
245                        let msg = format!("{msg}: {regexec_errmsg}");
246                        Err(msg)
247                    }
248                    None => Err(fmt!(&gettext("regexec returned code %i"), errcode)),
249                }
250            },
251        }
252    }
253}
254
/// Releases the resources held by the compiled regex.
impl Drop for Regex {
    fn drop(&mut self) {
        // SAFETY: `Regex::new` only constructs a `Regex` after `regcomp()` succeeded, so
        // `self.regex` is a compiled regex; `drop` runs at most once, so it's freed only once.
        unsafe {
            regfree(&mut self.regex);
        }
    }
}
262
#[cfg(test)]
mod tests {
    use super::*;

    /// Compiles `pattern` with `comp_flags` and matches it against `input` with no match flags.
    ///
    /// Panics if either compilation or matching reports an error.
    fn find(pattern: &str, comp_flags: CompFlags, input: &str, max_matches: usize) -> Vec<Match> {
        Regex::new(pattern, comp_flags)
            .unwrap()
            .matches(input, max_matches, MatchFlags::empty())
            .unwrap()
    }

    /// Checks that `found` starts at `start` and ends right before `one_past_end`.
    fn assert_span(found: &Match, start: usize, one_past_end: usize) {
        assert_eq!(found.start_pos, start);
        assert_eq!(found.end_pos, one_past_end);
    }

    #[test]
    fn matches_basic_posix_regular_expression() {
        let found = find("abc+", CompFlags::empty(), "abc+others", 1);

        assert_eq!(found.len(), 1);

        // Match spans from the start of the input until the 4th character
        assert_span(&found[0], 0, 4);
    }

    #[test]
    fn matches_extended_posix_regular_expression() {
        let found = find(
            "aBc+",
            CompFlags::EXTENDED | CompFlags::IGNORE_CASE,
            "AbCcCcCC and others",
            1,
        );

        assert_eq!(found.len(), 1);

        // Match spans from the start of the input until the 8th character
        assert_span(&found[0], 0, 8);
    }

    #[test]
    fn returns_empty_when_regex_valid_but_no_match() {
        let found = find("abc", CompFlags::empty(), "cba", 1);

        assert_eq!(found.len(), 0);
    }

    #[test]
    fn returns_no_more_results_than_max_matches() {
        // The regex has three groups, but we only leave room for the whole match and one group
        let found = find("(a)(b)(c)", CompFlags::EXTENDED, "abc", 2);

        assert_eq!(found.len(), 2);

        assert_span(&found[0], 0, 3);
        assert_span(&found[1], 0, 1);
    }

    #[test]
    fn returns_no_more_results_than_available() {
        // Room for ten results, but the regex has no groups, so only the whole match is reported
        let found = find("abc", CompFlags::EXTENDED, "abc", 10);

        assert_eq!(found.len(), 1);

        assert_span(&found[0], 0, 3);
    }

    #[test]
    fn new_returns_error_on_invalid_regex() {
        let msg = match Regex::new("(abc", CompFlags::EXTENDED) {
            Ok(_) => panic!("Unbalanced parenthesis should have been rejected"),
            Err(msg) => msg,
        };

        // There should be at least an error code, so string can't possibly be empty
        assert!(!msg.is_empty());

        // The message shouldn't contain a C string terminator (NUL) at the end
        assert!(!msg.ends_with('\0'));
    }
}