syntect_no_panic/parsing/
regex.rs

1use crate::parsing::ParseSyntaxError;
2use once_cell::sync::OnceCell;
3use serde::de::{Deserialize, Deserializer};
4use serde::ser::{Serialize, Serializer};
5use std::error::Error;
6
/// An abstraction for regex patterns.
///
/// * Allows swapping out the regex implementation because it's only in this module.
/// * Makes regexes serializable and deserializable using just the pattern string.
/// * Lazily compiles regexes on first use to improve initialization time.
///
/// A pattern that fails to compile is not rejected at construction time: the failure message
/// is stored alongside the pattern on first use and handed back by the matching methods.
#[derive(Debug)]
pub struct Regex {
    /// The pattern source. This is the only state that is serialized, cloned and compared.
    regex_str: String,
    /// Outcome of compiling `regex_str`, filled in on first use: either the compiled regex or
    /// the compile error rendered as a string.
    regex: OnceCell<Result<regex_impl::Regex, String>>,
}
17
/// A region contains text positions for capture groups in a match result.
///
/// This is a thin wrapper around the region type of whichever regex backend is compiled in,
/// so that callers do not name the backend type directly.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Region {
    /// Backend-specific storage for the capture group positions.
    region: regex_impl::Region,
}
23
24impl Regex {
25    /// Create a new regex from the pattern string.
26    ///
27    /// Note that the regex compilation happens on first use, which is why this method does not
28    /// return a result.
29    pub fn new(regex_str: String) -> Self {
30        Self {
31            regex_str,
32            regex: OnceCell::new(),
33        }
34    }
35
36    /// Check whether the pattern compiles as a valid regex or not.
37    pub fn try_compile(regex_str: &str) -> Option<Box<dyn Error + Send + Sync + 'static>> {
38        regex_impl::Regex::new(regex_str).err()
39    }
40
41    /// Return the regex pattern.
42    pub fn regex_str(&self) -> &str {
43        &self.regex_str
44    }
45
46    /// Check if the regex matches the given text.
47    pub fn is_match<'t>(
48        &self,
49        text: &'t str,
50        ignore_errors: bool,
51    ) -> Result<bool, ParseSyntaxError> {
52        match self.is_match_failible(text) {
53            Ok(result) => Ok(result),
54            Err(e) => {
55                if ignore_errors {
56                    Ok(false)
57                } else {
58                    Err(ParseSyntaxError::RegexCompileError(
59                        self.regex_str.to_string(),
60                        e.to_string().into(),
61                    ))
62                }
63            }
64        }
65    }
66
67    /// Search for the pattern in the given text from begin/end positions.
68    ///
69    /// If a region is passed, it is used for storing match group positions. The argument allows
70    /// the [`Region`] to be reused between searches, which makes a significant performance
71    /// difference.
72    ///
73    /// [`Region`]: struct.Region.html
74    pub fn search(
75        &self,
76        text: &str,
77        begin: usize,
78        end: usize,
79        region: Option<&mut Region>,
80        ignore_errors: bool,
81    ) -> Result<bool, ParseSyntaxError> {
82        match self.search_failible(text, begin, end, region) {
83            Ok(result) => Ok(result),
84            Err(e) => {
85                if ignore_errors {
86                    Ok(false)
87                } else {
88                    Err(ParseSyntaxError::RegexCompileError(
89                        self.regex_str.to_string(),
90                        e.to_string().into(),
91                    ))
92                }
93            }
94        }
95    }
96
97    /// Check if the regex matches the given text.
98    ///
99    /// In order to be called repetitively when in error, the error message is returned as a &str
100    /// without allocation
101    pub fn is_match_failible<'t>(&self, text: &'t str) -> Result<bool, &str> {
102        match self.regex() {
103            Ok(r) => Ok(r.is_match(text)),
104            Err(e) => Err(e.as_str()),
105        }
106    }
107    /// Search for the pattern in the given text from begin/end positions.
108    ///
109    /// If a region is passed, it is used for storing match group positions. The argument allows
110    /// the [`Region`] to be reused between searches, which makes a significant performance
111    /// difference.
112    ///
113    /// [`Region`]: struct.Region.html
114    pub fn search_failible(
115        &self,
116        text: &str,
117        begin: usize,
118        end: usize,
119        region: Option<&mut Region>,
120    ) -> Result<bool, &str> {
121        match self.regex() {
122            Ok(r) => Ok(r.search(text, begin, end, region.map(|r| &mut r.region))),
123            Err(e) => Err(e.as_str()),
124        }
125    }
126
127    fn regex(&self) -> &Result<regex_impl::Regex, String> {
128        self.regex
129            .get_or_init(|| regex_impl::Regex::new(&self.regex_str).map_err(|e| e.to_string()))
130    }
131}
132
133impl Clone for Regex {
134    fn clone(&self) -> Self {
135        Regex {
136            regex_str: self.regex_str.clone(),
137            regex: OnceCell::new(),
138        }
139    }
140}
141
142impl PartialEq for Regex {
143    fn eq(&self, other: &Regex) -> bool {
144        self.regex_str == other.regex_str
145    }
146}
147
148impl Eq for Regex {}
149
150impl Serialize for Regex {
151    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
152    where
153        S: Serializer,
154    {
155        serializer.serialize_str(&self.regex_str)
156    }
157}
158
159impl<'de> Deserialize<'de> for Regex {
160    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
161    where
162        D: Deserializer<'de>,
163    {
164        let regex_str = String::deserialize(deserializer)?;
165        Ok(Regex::new(regex_str))
166    }
167}
168
169impl Region {
170    pub fn new() -> Self {
171        Self {
172            region: regex_impl::new_region(),
173        }
174    }
175
176    /// Get the start/end positions of the capture group with given index.
177    ///
178    /// If there is no match for that group or the index does not correspond to a group, `None` is
179    /// returned. The index 0 returns the whole match.
180    pub fn pos(&self, index: usize) -> Option<(usize, usize)> {
181        self.region.pos(index)
182    }
183}
184
185impl Default for Region {
186    fn default() -> Self {
187        Self::new()
188    }
189}
190
#[cfg(feature = "regex-onig")]
mod regex_impl {
    pub use onig::Region;
    use onig::{MatchParam, RegexOptions, SearchOptions, Syntax};
    use std::error::Error;

    /// Regex backend built on the `onig` crate.
    #[derive(Debug)]
    pub struct Regex {
        regex: onig::Regex,
    }

    /// Create a region with capacity for 8 capture groups reserved up front.
    pub fn new_region() -> Region {
        Region::with_capacity(8)
    }

    impl Regex {
        /// Compile `regex_str` with the capture-group option and the default syntax.
        ///
        /// The onig error is boxed so both backends expose the same error type.
        pub fn new(regex_str: &str) -> Result<Regex, Box<dyn Error + Send + Sync + 'static>> {
            let result = onig::Regex::with_options(
                regex_str,
                RegexOptions::REGEX_OPTION_CAPTURE_GROUP,
                Syntax::default(),
            );
            match result {
                Ok(regex) => Ok(Regex { regex }),
                Err(error) => Err(Box::new(error)),
            }
        }

        /// Try to match the regex against `text` at offset 0.
        ///
        /// Errors raised while matching are treated as non-matches, exactly like `search`
        /// below does.
        pub fn is_match(&self, text: &str) -> bool {
            // `match_with_param` hands the error back as a `Result`; the shorter
            // `match_with_options` convenience wrapper panics on a match error instead,
            // which is not acceptable here.
            let matched = self.regex.match_with_param(
                text,
                0,
                SearchOptions::SEARCH_OPTION_NONE,
                None,
                MatchParam::default(),
            );

            matches!(matched, Ok(Some(_)))
        }

        /// Search `text` between `begin` and `end`, optionally recording capture group
        /// positions in `region`. Returns whether a match was found.
        pub fn search(
            &self,
            text: &str,
            begin: usize,
            end: usize,
            region: Option<&mut Region>,
        ) -> bool {
            let matched = self.regex.search_with_param(
                text,
                begin,
                end,
                SearchOptions::SEARCH_OPTION_NONE,
                region,
                MatchParam::default(),
            );

            // If there's an error during search, treat it as non-matching.
            // For example, in case of catastrophic backtracking, onig should
            // fail with a "retry-limit-in-match over" error eventually.
            matches!(matched, Ok(Some(_)))
        }
    }
}
248
// If both regex-fancy and regex-onig are requested, this condition makes regex-onig win.
#[cfg(all(feature = "regex-fancy", not(feature = "regex-onig")))]
mod regex_impl {
    use std::error::Error;

    /// Regex backend built on the `fancy_regex` crate.
    #[derive(Debug)]
    pub struct Regex {
        regex: fancy_regex::Regex,
    }

    /// Capture group positions of the most recent successful search.
    ///
    /// Entry `i` holds the start/end byte offsets of group `i`, or `None` if that group did
    /// not take part in the match.
    #[derive(Clone, Debug, Eq, PartialEq)]
    pub struct Region {
        positions: Vec<Option<(usize, usize)>>,
    }

    /// Create an empty region with capacity for 8 capture groups reserved up front.
    pub fn new_region() -> Region {
        Region {
            positions: Vec::with_capacity(8),
        }
    }

    impl Regex {
        /// Compile `regex_str`, boxing the error so both backends expose the same error type.
        pub fn new(regex_str: &str) -> Result<Regex, Box<dyn Error + Send + Sync + 'static>> {
            let result = fancy_regex::Regex::new(regex_str);
            match result {
                Ok(regex) => Ok(Regex { regex }),
                Err(error) => Err(Box::new(error)),
            }
        }

        /// Check whether the regex matches anywhere in `text`.
        pub fn is_match(&self, text: &str) -> bool {
            // Errors are treated as non-matches
            self.regex.is_match(text).unwrap_or(false)
        }

        /// Search `text[..end]` starting at `begin`, optionally recording capture group
        /// positions in `region`. Returns whether a match was found.
        ///
        /// An unusable range (`end` past the text or inside a code point, or `begin` after
        /// `end`) is reported as "no match" rather than panicking.
        pub fn search(
            &self,
            text: &str,
            begin: usize,
            end: usize,
            region: Option<&mut Region>,
        ) -> bool {
            // `str::get` returns `None` where slicing with `[..end]` would panic.
            let haystack = match text.get(..end) {
                Some(haystack) if begin <= end => haystack,
                _ => return false,
            };

            // If there's an error during search, treat it as non-matching.
            // For example, in case of catastrophic backtracking, fancy-regex should
            // fail with an error eventually.
            match self.regex.captures_from_pos(haystack, begin) {
                Ok(Some(captures)) => {
                    if let Some(region) = region {
                        region.init_from_captures(&captures);
                    }
                    true
                }
                _ => false,
            }
        }
    }

    impl Region {
        /// Replace the stored positions with those of `captures`, reusing the allocation.
        fn init_from_captures(&mut self, captures: &fancy_regex::Captures) {
            self.positions.clear();
            self.positions.extend(
                (0..captures.len()).map(|i| captures.get(i).map(|m| (m.start(), m.end()))),
            );
        }

        /// Start/end offsets of capture group `i`, or `None` if the group did not match or
        /// `i` is not a stored group index.
        pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
            self.positions.get(i).copied().flatten()
        }
    }
}
323
#[cfg(test)]
mod tests {
    use super::*;

    /// The pattern is compiled by the first match attempt, not by construction, and the
    /// result stays in the cell afterwards.
    #[test]
    fn caches_compiled_regex() {
        let lazy = Regex::new(String::from(r"\w+"));

        // Construction alone leaves the cell empty.
        assert!(lazy.regex.get().is_none());

        // The first use compiles the pattern and fills the cell.
        let matched = lazy.is_match("test", false).unwrap();
        assert!(matched);
        assert!(lazy.regex.get().is_some());
    }

    /// A regex round-trips through serde as nothing more than its pattern string.
    #[test]
    fn serde_as_string() {
        let json = "\"just a string\"";

        let decoded: Regex = serde_json::from_str(json).unwrap();
        assert_eq!(decoded.regex_str(), "just a string");

        let encoded = serde_json::to_string(&decoded).unwrap();
        assert_eq!(encoded, json);
    }
}