intuicio_parser/
regex.rs

1use std::{cell::RefCell, collections::HashMap, sync::Arc};
2
3use crate::{
4    ParseResult, Parser, ParserExt, ParserHandle, ParserNoValue, ParserOutput, ParserRegistry,
5};
6
7pub mod shorthand {
8    use super::*;
9    use crate::shorthand::map;
10
11    pub fn regex(pattern: impl AsRef<str>) -> ParserHandle {
12        RegexParser::new(pattern).into_handle()
13    }
14
15    pub fn regex_capture(pattern: impl AsRef<str>, capture: impl ToString) -> ParserHandle {
16        RegexParser::new_capture(pattern, capture).into_handle()
17    }
18
19    pub fn any() -> ParserHandle {
20        regex(r".")
21    }
22
23    pub fn nl() -> ParserHandle {
24        regex(r"[\r\n]")
25    }
26
27    pub fn digit_hex() -> ParserHandle {
28        regex(r"[0-9a-fA-F]&")
29    }
30
31    pub fn digit() -> ParserHandle {
32        regex(r"\d")
33    }
34
35    pub fn number_int_pos() -> ParserHandle {
36        regex(r"\d+")
37    }
38
39    pub fn number_int() -> ParserHandle {
40        regex(r"-?\d+")
41    }
42
43    pub fn number_float() -> ParserHandle {
44        regex(r"-?\d+(\.\d+(e-?\d+)?)?")
45    }
46
47    pub fn alphanum() -> ParserHandle {
48        regex(r"\w")
49    }
50
51    pub fn alpha_low() -> ParserHandle {
52        regex(r"[a-z]")
53    }
54
55    pub fn alpha_up() -> ParserHandle {
56        regex(r"[A-Z]")
57    }
58
59    pub fn alpha() -> ParserHandle {
60        regex(r"[a-zA-Z]")
61    }
62
63    pub fn word() -> ParserHandle {
64        regex(r"\w+")
65    }
66
67    pub fn string(open: &str, close: &str) -> ParserHandle {
68        let open = open.escape_unicode().to_string();
69        let close = close.escape_unicode().to_string();
70        let pattern = format!("{open}(?<content>[^{close}]*){close}");
71        map(regex_capture(pattern, "content"), move |value: String| {
72            snailquote::unescape(&value).unwrap()
73        })
74    }
75
76    pub fn id_start() -> ParserHandle {
77        regex(r"[a-zA-Z_]")
78    }
79
80    pub fn id_continue() -> ParserHandle {
81        regex(r"[0-9a-zA-Z_]*")
82    }
83
84    pub fn id() -> ParserHandle {
85        regex(r"[a-zA-Z_][0-9a-zA-Z_]*")
86    }
87
88    pub fn ws() -> ParserHandle {
89        WhiteSpaceParser::default().into_handle()
90    }
91
92    pub fn ows() -> ParserHandle {
93        OptionalWhiteSpaceParser::default().into_handle()
94    }
95}
96
thread_local! {
    // Per-thread cache of compiled regexes. Keys are the pattern strings
    // exactly as handed to the `RegexParser` constructors; the stored regex
    // is the compiled form of that pattern with `^` prepended.
    static REGEX_CACHE: RefCell<HashMap<String, Arc<regex::Regex>>> = Default::default();
}
100
/// Parser backed by a compiled regular expression.
#[derive(Clone)]
pub struct RegexParser {
    /// Compiled regex, shared through the thread-local cache.
    regex: Arc<regex::Regex>,
    /// Name of the capture group whose text is produced; `None` means the
    /// whole match is produced.
    capture: Option<String>,
}
106
107impl RegexParser {
108    pub fn new(pattern: impl AsRef<str>) -> Self {
109        let pattern = pattern.as_ref();
110        REGEX_CACHE.with_borrow_mut(|cache| {
111            if let Some(cached) = cache.get(pattern) {
112                return Self {
113                    regex: cached.clone(),
114                    capture: None,
115                };
116            }
117            let regex = Arc::new(
118                regex::Regex::new(&format!(r"^{}", pattern)).expect("Expected valid regex"),
119            );
120            cache.insert(pattern.to_string(), regex.clone());
121            Self {
122                regex,
123                capture: None,
124            }
125        })
126    }
127
128    pub fn new_capture(pattern: impl AsRef<str>, capture: impl ToString) -> Self {
129        let pattern = pattern.as_ref();
130        let capture = capture.to_string();
131        REGEX_CACHE.with_borrow_mut(|cache| {
132            if let Some(cached) = cache.get(pattern) {
133                return Self {
134                    regex: cached.clone(),
135                    capture: Some(capture),
136                };
137            }
138            let regex = Arc::new(
139                regex::Regex::new(&format!(r"^{}", pattern)).expect("Expected valid regex"),
140            );
141            cache.insert(pattern.to_string(), regex.clone());
142            Self {
143                regex,
144                capture: Some(capture),
145            }
146        })
147    }
148}
149
150impl Parser for RegexParser {
151    fn parse<'a>(&self, _: &ParserRegistry, input: &'a str) -> ParseResult<'a> {
152        if let Some(capture) = self.capture.as_deref() {
153            if let Some(cap) = self.regex.captures(input) {
154                Ok((
155                    &input[cap.get(0).unwrap().end()..],
156                    ParserOutput::new(
157                        cap.name(capture)
158                            .map(|mat| mat.as_str())
159                            .unwrap_or("")
160                            .to_owned(),
161                    )
162                    .ok()
163                    .unwrap(),
164                ))
165            } else {
166                Err(format!(
167                    "Expected regex match '{}' with capture: '{}'",
168                    self.regex, capture
169                )
170                .into())
171            }
172        } else if let Some(mat) = self.regex.find(input) {
173            Ok((
174                &input[mat.end()..],
175                ParserOutput::new(mat.as_str().to_owned()).ok().unwrap(),
176            ))
177        } else {
178            Err(format!("Expected regex match '{}'", self.regex).into())
179        }
180    }
181}
182
/// Parser newtype wrapping a [`RegexParser`]; its `Parser` impl discards the
/// matched text and produces `ParserNoValue`.
#[derive(Clone)]
pub struct WhiteSpaceParser(RegexParser);
185
186impl Default for WhiteSpaceParser {
187    fn default() -> Self {
188        Self(RegexParser::new(r"\s+"))
189    }
190}
191
192impl Parser for WhiteSpaceParser {
193    fn parse<'a>(&self, registry: &ParserRegistry, input: &'a str) -> ParseResult<'a> {
194        match self.0.parse(registry, input) {
195            Ok((rest, _)) => Ok((rest, ParserOutput::new(ParserNoValue).ok().unwrap())),
196            Err(error) => Err(error),
197        }
198    }
199}
200
/// Parser newtype wrapping a [`RegexParser`]; its `Parser` impl discards the
/// matched text and produces `ParserNoValue`.
#[derive(Clone)]
pub struct OptionalWhiteSpaceParser(RegexParser);
203
204impl Default for OptionalWhiteSpaceParser {
205    fn default() -> Self {
206        Self(RegexParser::new(r"\s*"))
207    }
208}
209
210impl Parser for OptionalWhiteSpaceParser {
211    fn parse<'a>(&self, registry: &ParserRegistry, input: &'a str) -> ParseResult<'a> {
212        match self.0.parse(registry, input) {
213            Ok((rest, _)) => Ok((rest, ParserOutput::new(ParserNoValue).ok().unwrap())),
214            Err(error) => Err(error),
215        }
216    }
217}
218
#[cfg(test)]
mod tests {
    use crate::{
        ParserRegistry,
        regex::{OptionalWhiteSpaceParser, RegexParser, WhiteSpaceParser},
        shorthand::{ows, regex, regex_capture, string, ws},
    };

    /// Compiles only when `T` is both `Send` and `Sync`.
    fn assert_send_sync<T: Send + Sync>() {}

    #[test]
    fn test_regex() {
        // The parser types must be shareable across threads.
        assert_send_sync::<RegexParser>();
        assert_send_sync::<WhiteSpaceParser>();
        assert_send_sync::<OptionalWhiteSpaceParser>();

        let registry = ParserRegistry::default();

        // Named capture: only the group text is produced.
        let captured = regex_capture(r"\s+(?<name>\w+)\s+", "name");
        let (remaining, output) = captured.parse(&registry, " foo ").unwrap();
        assert_eq!(remaining, "");
        assert_eq!(output.read::<String>().unwrap().as_str(), "foo");

        // String literal with identical open/close delimiters.
        let backticked = string("`", "`");
        let (remaining, output) = backticked.parse(&registry, "`Hello World!`").unwrap();
        assert_eq!(remaining, "");
        assert_eq!(output.read::<String>().unwrap().as_str(), "Hello World!");

        // String literal whose delimiters are regex metacharacters.
        let parenthesized = string("(", ")");
        let (remaining, output) = parenthesized.parse(&registry, "(Hello World!)").unwrap();
        assert_eq!(remaining, "");
        assert_eq!(output.read::<String>().unwrap().as_str(), "Hello World!");

        // Plain regex: the remainder starts right after the match.
        let word = regex(r"\w+");
        let (remaining, _) = word.parse(&registry, "foo bar").unwrap();
        assert_eq!(remaining, " bar");

        // Mandatory whitespace consumes everything and fails on none.
        let whitespace = ws();
        assert_eq!(whitespace.parse(&registry, "   \t  \n").unwrap().0, "");
        let failure = whitespace.parse(&registry, "a").err().unwrap();
        assert_eq!(failure.to_string(), "Expected regex match '^\\s+'");

        // Optional whitespace succeeds with or without whitespace.
        let optional_whitespace = ows();
        assert_eq!(
            optional_whitespace.parse(&registry, "   \t  \n").unwrap().0,
            ""
        );
        assert_eq!(optional_whitespace.parse(&registry, "foo").unwrap().0, "foo");
    }
}