regex_rs/lib.rs
1//! Safe wrapper for [POSIX regular expressions API][regex-h] (provided by libc on POSIX-compliant OSes).
2//!
3//! [regex-h]: https://pubs.opengroup.org/onlinepubs/9699919799.2008edition/basedefs/regex.h.html#tag_13_37
4//!
5//! ```
6//! use regex_rs::*;
7//!
8//! let pattern = "This( often)? repeats time and again(, and again)*\\.";
9//! let compilation_flags = CompFlags::EXTENDED;
10//! let regex = Regex::new(pattern, compilation_flags)
11//! .expect("Failed to compile pattern as POSIX extended regular expression");
12//!
13//! let input = "This repeats time and again, and again, and again.";
14//! // We're only interested in the first match, i.e. the part of text
15//! // that's matched by the whole regex
16//! let max_matches = 1;
17//! let match_flags = MatchFlags::empty();
18//! let matches = regex
19//! .matches(input, max_matches, match_flags)
20//! .expect("Error matching input against regex");
21//!
22//! // Found a match
23//! assert_eq!(matches.len(), 1);
24//!
25//! // Match spans from the beginning to the end of the input
26//! assert_eq!(matches[0].start_pos, 0);
27//! // `end_pos` holds one-past-the-end index
28//! assert_eq!(matches[0].end_pos, input.len());
29//! ```
30
31// This lint is nitpicky, I don't think it's really important how the literals are written.
32#![allow(clippy::unreadable_literal)]
33
34use bitflags::bitflags;
35use gettextrs::gettext;
36use libc::{regcomp, regerror, regex_t, regexec, regfree, regmatch_t};
37use std::ffi::{CString, OsString};
38use std::mem;
39use std::os::unix::ffi::OsStringExt;
40use std::ptr;
41use strprintf::fmt;
42
/// POSIX regular expression.
///
/// Wraps a `regex_t` that was compiled by `regcomp()`. Values are created with `Regex::new`;
/// the compiled pattern is released with `regfree()` when the value is dropped.
pub struct Regex {
    /// Compiled POSIX regular expression.
    regex: regex_t,
}
48
bitflags! {
    /// Compilation flags.
    ///
    /// These are handed to `regcomp()` by `Regex::new`. They affect what features are available
    /// inside the regex, and also how it's matched against the input string.
    pub struct CompFlags: i32 {
        /// Use Extended Regular Expressions.
        ///
        /// POSIX calls this `REG_EXTENDED`.
        const EXTENDED = libc::REG_EXTENDED;

        /// Ignore case when matching.
        ///
        /// POSIX calls this `REG_ICASE`.
        const IGNORE_CASE = libc::REG_ICASE;

        /// Report only success or failure of matching, without the positions of the matches.
        ///
        /// POSIX calls this `REG_NOSUB`.
        const NO_SUB = libc::REG_NOSUB;

        /// Give special meaning to newline characters.
        ///
        /// POSIX calls this `REG_NEWLINE`.
        ///
        /// Without this flag, newlines match themselves.
        ///
        /// With this flag, newlines match themselves except:
        ///
        /// 1. newline is not matched by `.` outside of bracket expressions or by any form of
        ///    non-matching lists;
        ///
        /// 2. beginning-of-line (`^`) matches zero-width string right after newline, regardless of
        ///    `MatchFlags::NOTBOL`;
        ///
        /// 3. end-of-line (`$`) matches zero-width string right before a newline, regardless of
        ///    `MatchFlags::NOTEOL`.
        const NEWLINE = libc::REG_NEWLINE;
    }
}
89
bitflags! {
    /// Matching flags.
    ///
    /// These are handed to `regexec()` by `Regex::matches`. They affect how regex is matched
    /// against the input string.
    pub struct MatchFlags: i32 {
        /// The circumflex character (`^`), when taken as a special character, does not match the
        /// beginning of string.
        ///
        /// POSIX calls this `REG_NOTBOL`.
        const NOTBOL = libc::REG_NOTBOL;

        /// The dollar-sign (`$`), when taken as a special character, does not match the end of
        /// string.
        ///
        /// POSIX calls this `REG_NOTEOL`.
        const NOTEOL = libc::REG_NOTEOL;
    }
}
104
/// Start and end positions of a matched substring.
///
/// Both positions are byte offsets into the input string that was matched, so
/// `&input[m.start_pos..m.end_pos]` is the matched text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Match {
    /// Start position (counting from zero).
    pub start_pos: usize,

    /// One-past-end position (counting from zero).
    pub end_pos: usize,
}
113
114/// A wrapper around `libc::regerror()`.
115unsafe fn regex_error_to_str(errcode: libc::c_int, regex: ®ex_t) -> Option<String> {
116 // Find out the size of the buffer needed to hold the error message
117 let errmsg_length = regerror(errcode, regex, ptr::null_mut(), 0);
118
119 // Allocate the buffer and get the message.
120 let mut errmsg: Vec<u8> = vec![0; errmsg_length];
121 // Casting `*mut u8` to `*mut c_char` should be safe since C doesn't really care:
122 // it can store any ASCII symbol in a `char`, disregarding signedness.
123 regerror(
124 errcode,
125 regex,
126 errmsg.as_mut_ptr() as *mut std::os::raw::c_char,
127 errmsg_length,
128 );
129
130 // Drop the trailing NUL byte that C uses to terminate strings
131 errmsg.pop();
132
133 OsString::from_vec(errmsg).into_string().ok()
134}
135
136impl Regex {
137 /// Compiles pattern as a regular expression.
138 ///
139 /// By default, pattern is assumed to be a basic regular expression. To interpret it as an
140 /// extended regular expression, add `CompFlags::EXTENDED` to the `flags`. See also other
141 /// `CompFlags` values to control some other aspects of the regex.
142 ///
143 /// # Returns
144 ///
145 /// Compiled regex or an error message.
146 pub fn new(pattern: &str, flags: CompFlags) -> Result<Regex, String> {
147 let pattern = CString::new(pattern)
148 .map_err(|_| String::from("Regular expression contains NUL byte"))?;
149
150 unsafe {
151 let mut regex: regex_t = mem::zeroed();
152 let errcode = regcomp(&mut regex, pattern.into_raw(), flags.bits());
153
154 if errcode == 0 {
155 Ok(Regex { regex })
156 } else {
157 match regex_error_to_str(errcode, ®ex) {
158 Some(regcomp_errmsg) => {
159 let msg = fmt!(&gettext("regcomp returned code %i"), errcode);
160 let msg = format!("{msg}: {regcomp_errmsg}");
161 Err(msg)
162 }
163
164 None => Err(fmt!(&gettext("regcomp returned code %i"), errcode)),
165 }
166 }
167 }
168 }
169
170 /// Matches input string against regex, looking for up to `max_matches` matches.
171 ///
172 /// Regexes can contain parenthesized subexpressions. This method will return up to
173 /// `max_matches`-1 of those. First match is reserved for the text that the whole regex
174 /// matched.
175 ///
176 /// `flags` dictate how matching is performed. See `MatchFlags` for details.
177 ///
178 /// # Returns
179 ///
180 /// - `Ok` with an empty vector if no match found, or if `max_matches` is 0 and there were no
181 /// errors.
182 /// - `Ok` with a non-empty vector if `max_matches` was non-zero and a match was found. First
183 /// element of the vector is the text that regex as a whole matched. The rest of the elements
184 /// are pieces of text that were matched by parenthesized subexpressions.
185 /// - `Err` with an error message.
186 pub fn matches(
187 &self,
188 input: &str,
189 max_matches: usize,
190 flags: MatchFlags,
191 ) -> Result<Vec<Match>, String> {
192 let input =
193 CString::new(input).map_err(|_| String::from("Input string contains NUL byte"))?;
194
195 let mut pmatch: Vec<regmatch_t>;
196
197 let errcode = unsafe {
198 pmatch = vec![mem::zeroed(); max_matches];
199
200 regexec(
201 &self.regex,
202 input.into_raw(),
203 max_matches as libc::size_t,
204 pmatch.as_mut_ptr(),
205 flags.bits(),
206 )
207 };
208
209 match errcode {
210 0 => {
211 // Success. Let's copy results
212 let mut matches: Vec<Match> = Vec::new();
213
214 for m in pmatch {
215 if m.rm_so < 0 || m.rm_eo < 0 {
216 // Since `max_matches` can be bigger than the number of parenthesized
217 // blocks in the regex, it's possible that some of the `pmatch` values are
218 // empty. We exit the loop after detecting first such value.
219 break;
220 }
221
222 // It's safe to cast i32 to usize here:
223 // - we already checked that the values aren't negative
224 // - usize's upper bound is higher than i32's
225 matches.push(Match {
226 start_pos: m.rm_so as usize,
227 end_pos: m.rm_eo as usize,
228 });
229 }
230
231 Ok(matches)
232 }
233
234 libc::REG_NOMATCH => {
235 // Matching went okay, but nothing found
236 Ok(Vec::new())
237 }
238
239 // POSIX only specifies two return codes for regexec(), but implementations are free to
240 // extend that.
241 _ => unsafe {
242 match regex_error_to_str(errcode, &self.regex) {
243 Some(regexec_errmsg) => {
244 let msg = fmt!(&gettext("regexec returned code %i"), errcode);
245 let msg = format!("{msg}: {regexec_errmsg}");
246 Err(msg)
247 }
248 None => Err(fmt!(&gettext("regexec returned code %i"), errcode)),
249 }
250 },
251 }
252 }
253}
254
255impl Drop for Regex {
256 fn drop(&mut self) {
257 unsafe {
258 regfree(&mut self.regex);
259 }
260 }
261}
262
#[cfg(test)]
mod tests {
    use super::*;

    /// Compiles `pattern` with `comp_flags`, matches `input` against it asking for up to
    /// `max_matches` results, and returns the results as `(start_pos, end_pos)` pairs.
    ///
    /// Panics if either compilation or matching reports an error.
    fn find(
        pattern: &str,
        comp_flags: CompFlags,
        input: &str,
        max_matches: usize,
    ) -> Vec<(usize, usize)> {
        Regex::new(pattern, comp_flags)
            .unwrap()
            .matches(input, max_matches, MatchFlags::empty())
            .unwrap()
            .iter()
            .map(|found| (found.start_pos, found.end_pos))
            .collect()
    }

    #[test]
    fn matches_basic_posix_regular_expression() {
        // In basic regular expressions `+` is an ordinary character, so the match covers the
        // first four bytes of the input: "abc+". End position is one-past-last offset.
        let found = find("abc+", CompFlags::empty(), "abc+others", 1);

        assert_eq!(found, vec![(0, 4)]);
    }

    #[test]
    fn matches_extended_posix_regular_expression() {
        // In extended regular expressions `+` is a repetition, and case is ignored here, so the
        // match covers the first eight bytes of the input: "AbCcCcCC".
        let found = find(
            "aBc+",
            CompFlags::EXTENDED | CompFlags::IGNORE_CASE,
            "AbCcCcCC and others",
            1,
        );

        assert_eq!(found, vec![(0, 8)]);
    }

    #[test]
    fn returns_empty_when_regex_valid_but_no_match() {
        let found = find("abc", CompFlags::empty(), "cba", 1);

        assert!(found.is_empty());
    }

    #[test]
    fn returns_no_more_results_than_max_matches() {
        // The regex has three groups, but we only ask for the whole match plus the first group.
        let found = find("(a)(b)(c)", CompFlags::EXTENDED, "abc", 2);

        assert_eq!(found, vec![(0, 3), (0, 1)]);
    }

    #[test]
    fn returns_no_more_results_than_available() {
        // The regex has no groups, so only the whole match can be reported no matter how many
        // results we ask for.
        let found = find("abc", CompFlags::EXTENDED, "abc", 10);

        assert_eq!(found, vec![(0, 3)]);
    }

    #[test]
    fn new_returns_error_on_invalid_regex() {
        let msg = match Regex::new("(abc", CompFlags::EXTENDED) {
            Ok(_) => panic!("Pattern with unbalanced parenthesis must not compile"),
            Err(msg) => msg,
        };

        // There should be at least an error code, so string can't possibly be empty
        assert!(!msg.is_empty());

        // The message shouldn't contain a C string terminator (NUL) at the end
        assert!(!msg.ends_with('\0'));
    }
}
344}