intuicio_parser/
regex.rs

1use crate::{
2    ParseResult, Parser, ParserExt, ParserHandle, ParserNoValue, ParserOutput, ParserRegistry,
3};
4use std::{cell::RefCell, collections::HashMap, sync::Arc};
5
6pub mod shorthand {
7    use super::*;
8    use crate::shorthand::map;
9
10    pub fn regex(pattern: impl AsRef<str>) -> ParserHandle {
11        RegexParser::new(pattern).into_handle()
12    }
13
14    pub fn regex_capture(pattern: impl AsRef<str>, capture: impl ToString) -> ParserHandle {
15        RegexParser::new_capture(pattern, capture).into_handle()
16    }
17
18    pub fn any() -> ParserHandle {
19        regex(r".")
20    }
21
22    pub fn nl() -> ParserHandle {
23        regex(r"[\r\n]")
24    }
25
26    pub fn digit_hex() -> ParserHandle {
27        regex(r"[0-9a-fA-F]&")
28    }
29
30    pub fn digit() -> ParserHandle {
31        regex(r"\d")
32    }
33
34    pub fn number_int_pos() -> ParserHandle {
35        regex(r"\d+")
36    }
37
38    pub fn number_int() -> ParserHandle {
39        regex(r"-?\d+")
40    }
41
42    pub fn number_float() -> ParserHandle {
43        regex(r"-?\d+(\.\d+(e-?\d+)?)?")
44    }
45
46    pub fn alphanum() -> ParserHandle {
47        regex(r"\w")
48    }
49
50    pub fn alpha_low() -> ParserHandle {
51        regex(r"[a-z]")
52    }
53
54    pub fn alpha_up() -> ParserHandle {
55        regex(r"[A-Z]")
56    }
57
58    pub fn alpha() -> ParserHandle {
59        regex(r"[a-zA-Z]")
60    }
61
62    pub fn word() -> ParserHandle {
63        regex(r"\w+")
64    }
65
66    pub fn string(open: &str, close: &str) -> ParserHandle {
67        let open = open.escape_unicode().to_string();
68        let close = close.escape_unicode().to_string();
69        let pattern = format!("{open}(?<content>[^{close}]*){close}");
70        map(regex_capture(pattern, "content"), move |value: String| {
71            snailquote::unescape(&value).unwrap()
72        })
73    }
74
75    pub fn id_start() -> ParserHandle {
76        regex(r"[a-zA-Z_]")
77    }
78
79    pub fn id_continue() -> ParserHandle {
80        regex(r"[0-9a-zA-Z_]*")
81    }
82
83    pub fn id() -> ParserHandle {
84        regex(r"[a-zA-Z_][0-9a-zA-Z_]*")
85    }
86
87    pub fn ws() -> ParserHandle {
88        WhiteSpaceParser::default().into_handle()
89    }
90
91    pub fn ows() -> ParserHandle {
92        OptionalWhiteSpaceParser::default().into_handle()
93    }
94}
95
thread_local! {
    /// Per-thread cache of compiled regexes, keyed by the pattern text exactly
    /// as the caller supplied it (the stored regex is compiled from `^` +
    /// pattern). `RegexParser::new` and `RegexParser::new_capture` fill it;
    /// nothing in this file removes entries.
    static REGEX_CACHE: RefCell<HashMap<String, Arc<regex::Regex>>> = Default::default();
}
99
/// Parser that matches a regular expression against the beginning of the input
/// and yields either the whole match or one named capture group as a `String`.
#[derive(Clone)]
pub struct RegexParser {
    /// Compiled regex (the user pattern prefixed with `^`), shared through an
    /// `Arc` with the per-thread regex cache.
    regex: Arc<regex::Regex>,
    /// Name of the capture group whose text becomes the output; when `None`
    /// the whole match is the output.
    capture: Option<String>,
}
105
106impl RegexParser {
107    pub fn new(pattern: impl AsRef<str>) -> Self {
108        let pattern = pattern.as_ref();
109        REGEX_CACHE.with_borrow_mut(|cache| {
110            if let Some(cached) = cache.get(pattern) {
111                return Self {
112                    regex: cached.clone(),
113                    capture: None,
114                };
115            }
116            let regex = Arc::new(
117                regex::Regex::new(&format!(r"^{}", pattern)).expect("Expected valid regex"),
118            );
119            cache.insert(pattern.to_string(), regex.clone());
120            Self {
121                regex,
122                capture: None,
123            }
124        })
125    }
126
127    pub fn new_capture(pattern: impl AsRef<str>, capture: impl ToString) -> Self {
128        let pattern = pattern.as_ref();
129        let capture = capture.to_string();
130        REGEX_CACHE.with_borrow_mut(|cache| {
131            if let Some(cached) = cache.get(pattern) {
132                return Self {
133                    regex: cached.clone(),
134                    capture: Some(capture),
135                };
136            }
137            let regex = Arc::new(
138                regex::Regex::new(&format!(r"^{}", pattern)).expect("Expected valid regex"),
139            );
140            cache.insert(pattern.to_string(), regex.clone());
141            Self {
142                regex,
143                capture: Some(capture),
144            }
145        })
146    }
147}
148
149impl Parser for RegexParser {
150    fn parse<'a>(&self, _: &ParserRegistry, input: &'a str) -> ParseResult<'a> {
151        if let Some(capture) = self.capture.as_deref() {
152            if let Some(cap) = self.regex.captures(input) {
153                Ok((
154                    &input[cap.get(0).unwrap().end()..],
155                    ParserOutput::new(
156                        cap.name(capture)
157                            .map(|mat| mat.as_str())
158                            .unwrap_or("")
159                            .to_owned(),
160                    )
161                    .ok()
162                    .unwrap(),
163                ))
164            } else {
165                Err(format!(
166                    "Expected regex match '{}' with capture: '{}'",
167                    self.regex, capture
168                )
169                .into())
170            }
171        } else if let Some(mat) = self.regex.find(input) {
172            Ok((
173                &input[mat.end()..],
174                ParserOutput::new(mat.as_str().to_owned()).ok().unwrap(),
175            ))
176        } else {
177            Err(format!("Expected regex match '{}'", self.regex).into())
178        }
179    }
180}
181
/// Parser for mandatory whitespace: wraps a [`RegexParser`] and discards the
/// matched text.
#[derive(Clone)]
pub struct WhiteSpaceParser(RegexParser);
184
185impl Default for WhiteSpaceParser {
186    fn default() -> Self {
187        Self(RegexParser::new(r"\s+"))
188    }
189}
190
191impl Parser for WhiteSpaceParser {
192    fn parse<'a>(&self, registry: &ParserRegistry, input: &'a str) -> ParseResult<'a> {
193        match self.0.parse(registry, input) {
194            Ok((rest, _)) => Ok((rest, ParserOutput::new(ParserNoValue).ok().unwrap())),
195            Err(error) => Err(error),
196        }
197    }
198}
199
/// Parser for optional whitespace: wraps a [`RegexParser`] and discards the
/// matched text.
#[derive(Clone)]
pub struct OptionalWhiteSpaceParser(RegexParser);
202
203impl Default for OptionalWhiteSpaceParser {
204    fn default() -> Self {
205        Self(RegexParser::new(r"\s*"))
206    }
207}
208
209impl Parser for OptionalWhiteSpaceParser {
210    fn parse<'a>(&self, registry: &ParserRegistry, input: &'a str) -> ParseResult<'a> {
211        match self.0.parse(registry, input) {
212            Ok((rest, _)) => Ok((rest, ParserOutput::new(ParserNoValue).ok().unwrap())),
213            Err(error) => Err(error),
214        }
215    }
216}
217
#[cfg(test)]
mod tests {
    use crate::{
        ParserRegistry,
        regex::{OptionalWhiteSpaceParser, RegexParser, WhiteSpaceParser},
        shorthand::{ows, regex, regex_capture, string, ws},
    };

    /// Compile-time assertion that `T` is both `Send` and `Sync`.
    fn is_async<T: Send + Sync>() {}

    #[test]
    fn test_regex() {
        // All parser types of this file must be usable across threads.
        is_async::<RegexParser>();
        is_async::<WhiteSpaceParser>();
        is_async::<OptionalWhiteSpaceParser>();

        let registry = ParserRegistry::default();

        // A named capture yields only the group while consuming the whole match.
        let captured = regex_capture(r"\s+(?<name>\w+)\s+", "name");
        let (remaining, output) = captured.parse(&registry, " foo ").unwrap();
        assert_eq!(remaining, "");
        assert_eq!(output.read::<String>().unwrap().as_str(), "foo");

        // Delimited strings with identical open/close delimiters.
        let backticked = string("`", "`");
        let (remaining, output) = backticked.parse(&registry, "`Hello World!`").unwrap();
        assert_eq!(remaining, "");
        assert_eq!(output.read::<String>().unwrap().as_str(), "Hello World!");

        // Delimited strings whose delimiters are regex metacharacters.
        let parenthesized = string("(", ")");
        let (remaining, output) = parenthesized.parse(&registry, "(Hello World!)").unwrap();
        assert_eq!(remaining, "");
        assert_eq!(output.read::<String>().unwrap().as_str(), "Hello World!");

        // A plain regex stops at the first non-matching character.
        let word = regex(r"\w+");
        let (remaining, _) = word.parse(&registry, "foo bar").unwrap();
        assert_eq!(remaining, " bar");

        // Mandatory whitespace: succeeds on whitespace, fails otherwise.
        let whitespace = ws();
        let (remaining, _) = whitespace.parse(&registry, "   \t  \n").unwrap();
        assert_eq!(remaining, "");
        let failure = whitespace.parse(&registry, "a").err().unwrap();
        assert_eq!(failure.to_string(), "Expected regex match '^\\s+'");

        // Optional whitespace: succeeds even when nothing is consumed.
        let optional_whitespace = ows();
        let (remaining, _) = optional_whitespace.parse(&registry, "   \t  \n").unwrap();
        assert_eq!(remaining, "");
        let (remaining, _) = optional_whitespace.parse(&registry, "foo").unwrap();
        assert_eq!(remaining, "foo");
    }
}
265}