graph_auth/resource_path/string/
lexer.rs

1use nom::{InputTake, Offset};
2use nom::branch::alt;
3use nom::bytes::complete::{is_not, tag, take_while, take_while1, take_while_m_n};
4use nom::character::complete::{anychar, char, multispace1};
5use nom::combinator::{all_consuming, map, map_opt, map_res, recognize, value, verify};
6use nom::multi::{fold_many0, many1};
7use nom::number::complete::double;
8use nom::sequence::{delimited, preceded};
9use serde_json::Number;
10
11use crate::resource_path::string::lexer_utils::{IResult, LexerError, LexerState, LocatedSpan};
12use crate::resource_path::string::token::{Token, TokenValue};
13
14
15use super::range_ex::AsRange;
16
/// Creates a parser function for a token that maps 1:1 with its token value.
///
/// `$name` is the generated function's identifier, `$repr` is the literal
/// text the token matches, and `$token_value` is the `TokenValue` the
/// resulting `Token` carries. The token's span is exactly the matched text.
macro_rules! tag_token {
    ($name:ident, $repr:literal, $token_value:expr) => {
        fn $name(input: LocatedSpan) -> IResult<Token> {
            map(tag($repr), |span: LocatedSpan| {
                Token::new(span, $token_value)
            })(input)
        }
    };
}
27
/// A representation of a string fragment used to construct a string literal.
///
/// Literal fragments borrow from the input via `LocatedSpan`; only escaped
/// characters are materialized as owned values.
#[derive(Debug, Clone, PartialEq, Eq)]
enum StringFragment<'a> {
    /// A string literal containing no quotes or backslashes.
    Literal(LocatedSpan<'a>),

    /// An escaped character fragment.
    EscapedChar(char),

    /// Whitespace ignored from the string's representation.
    EscapedWhitespace,
}
40
41/// Returns the value of the inner parser if it succeeds. Otherwise, a `LexerError`
42/// containing the provided error message is reported to the inner state of the lexer.
43fn expect<'a, F, E, T>(
44    mut parser: F,
45    err_msg: E,
46) -> impl FnMut(LocatedSpan<'a>) -> IResult<Option<T>>
47where
48    F: FnMut(LocatedSpan<'a>) -> IResult<T>,
49    E: ToString,
50{
51    use nom::error::Error as NomError;
52    move |input| match parser(input) {
53        Ok((remaining, output)) => Ok((remaining, Some(output))),
54        Err(nom::Err::Error(NomError { input, code: _ }))
55        | Err(nom::Err::Failure(NomError { input, code: _ })) => {
56            let err = LexerError(input.as_range(), err_msg.to_string());
57            input.extra.report_error(err);
58
59            Ok((input, None))
60        }
61        Err(err) => Err(err),
62    }
63}
64
65fn lit_bool(input: LocatedSpan) -> IResult<Token> {
66    alt((
67        map(tag("false"), |span: LocatedSpan| {
68            Token::new(span, TokenValue::LitBool(false))
69        }),
70        map(tag("true"), |span: LocatedSpan| {
71            Token::new(span, TokenValue::LitBool(true))
72        }),
73    ))(input)
74}
75
76/// Parses an identifier.
77fn ident(input: LocatedSpan) -> IResult<Token> {
78    let first = verify(anychar, |c| c.is_ascii_alphabetic() || *c == '_');
79    let rest = take_while(|c: char| c.is_ascii_alphabetic() || c.is_ascii_digit() || c == '_');
80    let ident = recognize(preceded(first, rest));
81    map(ident, |span: LocatedSpan| {
82        let fragment = span.fragment().to_string();
83        Token::new(span, TokenValue::Ident(fragment))
84    })(input)
85}
86
/// Parse a unicode sequence, of the form u{XXXX}, where XXXX is 1 to 6
/// hexadecimal numerals.
///
/// Missing hex digits or a missing closing brace are reported to the lexer
/// state via `expect` (yielding `None`) rather than aborting the parse.
fn lit_str_unicode_char(input: LocatedSpan) -> IResult<char> {
    let parse_hex = take_while_m_n(1, 6, |c: char| c.is_ascii_hexdigit());
    // FIXME Figure out a way to keep correct span here.
    let parse_delim_hex = preceded(
        char('u'),
        delimited(
            char('{'),
            expect(parse_hex, "expected 1-6 hex digits"),
            expect(char('}'), "expected closing brace"),
        ),
    );
    // A recovered `None` or an unparsable hex string makes this parser fail
    // through `map_res`.
    let parse_u32 = map_res(parse_delim_hex, move |hex| match hex {
        None => Err("cannot parse number"),
        Some(hex) => match u32::from_str_radix(hex.fragment(), 16) {
            Ok(val) => Ok(val),
            Err(_) => Err("invalid number"),
        },
    });
    // `from_u32` rejects values that are not valid Unicode scalar values.
    map_opt(parse_u32, std::char::from_u32)(input)
}
109
/// Parse an escaped character: `\u{XXXX}`, `\n`, `\r`, `\t`, `\\`, `\/`,
/// or `\"`, returning the character it denotes.
fn lit_str_escaped_char(input: LocatedSpan) -> IResult<char> {
    preceded(
        char('\\'),
        alt((
            lit_str_unicode_char,
            value('\n', char('n')),
            value('\r', char('r')),
            value('\t', char('t')),
            value('\\', char('\\')),
            value('/', char('/')),
            value('"', char('"')),
        )),
    )(input)
}
125
126/// Parse a backslash, followed by any amount of whitespace. This is used later
127/// to discard any escaped whitespace.
128fn lit_str_escaped_whitespace(input: LocatedSpan) -> IResult<LocatedSpan> {
129    preceded(char('\\'), multispace1)(input)
130}
131
132/// Parse a non-empty block of text that doesn't include \ or "
133fn lit_str_literal(input: LocatedSpan) -> IResult<LocatedSpan> {
134    let not_quote_slash = is_not("\"\\");
135    verify(not_quote_slash, |s: &LocatedSpan| !s.is_empty())(input)
136}
137
138/// Parses a single kind of string fragment as described by the `StringFragment
139/// enumeration. This can be a string literal without any quotes or backslashes,
140/// an escaped character, or ignored whitespace.
141fn lit_str_fragment(input: LocatedSpan) -> IResult<StringFragment> {
142    alt((
143        map(lit_str_literal, StringFragment::Literal),
144        map(lit_str_escaped_char, StringFragment::EscapedChar),
145        value(
146            StringFragment::EscapedWhitespace,
147            lit_str_escaped_whitespace,
148        ),
149    ))(input)
150}
151
/// Parses and constructs a string literal from its fragments.
///
/// A missing closing quote is reported through `expect` rather than failing
/// the parse.
///
/// # Notes
///
/// This function uses heap allocation to construct a `String`.
fn lit_str(input: LocatedSpan) -> IResult<Token> {
    // Accumulate fragments into one owned String; escaped whitespace is
    // dropped from the result entirely.
    let build_string = fold_many0(lit_str_fragment, String::new, |mut string, fragment| {
        match fragment {
            StringFragment::Literal(s) => string.push_str(s.fragment()),
            StringFragment::EscapedChar(c) => string.push(c),
            StringFragment::EscapedWhitespace => {}
        }
        string
    });

    let (remainder, s) = delimited(
        char('"'),
        build_string,
        expect(char('"'), "expected closing quote"),
    )(input.clone())?;
    // The built String carries no location info, so reconstruct the token's
    // span from the number of input characters consumed.
    let span_offset = input.offset(&remainder);
    let span = input.take(span_offset);
    Ok((remainder, Token::new(span, TokenValue::LitStr(s))))
}
176
177/// Parses a numeric literal into either an `f64`, `u64`, or `i64`.
178fn lit_num(input: LocatedSpan) -> IResult<Token> {
179    let num = map_opt(double, |v: f64| {
180        let n = if v == (v as u64) as f64 {
181            Some(Number::from(v as u64))
182        } else if v < 0.0 && v == (v as i64) as f64 {
183            Some(Number::from(v as i64))
184        } else {
185            Number::from_f64(v)
186        };
187        n.map(TokenValue::LitNum)
188    });
189
190    map(num, |tv: TokenValue| Token::new(input.clone(), tv))(input.clone())
191}
192
193/// Parses ASCII whitespace.
194fn whitespace(input: LocatedSpan) -> IResult<Token> {
195    let ws = take_while1(|c: char| c.is_ascii_whitespace());
196    map(ws, |span: LocatedSpan| {
197        Token::new(span, TokenValue::Whitespace)
198    })(input)
199}
200
// Punctuation tokens, each mapped 1:1 to its TokenValue. Note that `scope`
// ("::") is listed before `colon` (":") in `expr`'s alternative list so a
// scope operator is not lexed as two colons.
tag_token!(scope, "::", TokenValue::Scope);
tag_token!(colon, ":", TokenValue::Colon);
tag_token!(wildcard, "*", TokenValue::Wildcard);
tag_token!(lcurly, "{", TokenValue::LCurly);
tag_token!(rcurly, "}", TokenValue::RCurly);
tag_token!(lparen, "(", TokenValue::LParen);
tag_token!(rparen, ")", TokenValue::RParen);
tag_token!(comma, ",", TokenValue::Comma);
209
/// Lexes the entire input into a token list. Leftover unlexable input is
/// reported as an "expected end-of-file" error through the lexer state, and
/// an empty token list is returned in that case.
fn expr(input: LocatedSpan) -> IResult<Vec<Token>> {
    // Ordering matters: `lit_bool` precedes `ident` so "true"/"false" lex as
    // boolean literals, and `scope` precedes `colon` so "::" is one token.
    // NOTE(review): `tag`-based parsers match prefixes, so an identifier such
    // as "falsey" would lex as LitBool(false) + Ident("y") — confirm intended.
    let tokens = many1(alt((
        lit_bool, ident, lit_str, lit_num, whitespace, scope, colon, wildcard, lcurly, rcurly,
        lparen, rparen, comma,
    )));
    // `expect` converts a parse failure into a reported error plus `None`,
    // so this function always returns Ok.
    let (remainder, token_list) = expect(all_consuming(tokens), "expected end-of-file")(input)?;
    Ok((remainder, token_list.unwrap_or_default()))
}
218
219/// Takes a Resource Path string representation and returns a vector of tokens.
220///
221/// # Notes
222///
223/// Heap allocation will occur for two token types: identifiers and string
224/// literals.
225fn tokenize<'a>(raw: &'a str) -> (Vec<Token>, Vec<LexerError>) {
226    let input = LocatedSpan::<'a>::new_extra(raw, LexerState::new());
227    let (remainder, tokens) = expr(input).expect("parser cannot fail");
228    (tokens, remainder.extra.0.into_inner())
229}
230
#[cfg(test)]
mod test {
    use std::ops::Range;

    use rstest::rstest;

    use crate::resource_path::string::lexer::tokenize;
    use crate::resource_path::string::lexer_utils::LexerError;
    use crate::resource_path::string::token::TokenValue;

    // Each fixed-text token lexes to a single token with the exact span and
    // value expected.
    #[rstest]
    #[case("::", 0..2, TokenValue::Scope)]
    #[case("*", 0..1, TokenValue::Wildcard)]
    #[case("{", 0..1, TokenValue::LCurly)]
    #[case("}", 0..1, TokenValue::RCurly)]
    #[case("(", 0..1, TokenValue::LParen)]
    #[case(")", 0..1, TokenValue::RParen)]
    #[case(",", 0..1, TokenValue::Comma)]
    #[case(":", 0..1, TokenValue::Colon)]
    #[case("false", 0..5, TokenValue::LitBool(false))]
    #[case("true", 0..4, TokenValue::LitBool(true))]
    fn tag_token(#[case] raw: &str, #[case] range: Range<usize>, #[case] tv: TokenValue) {
        let (tokens, errors) = tokenize(raw);
        assert!(errors.is_empty());
        assert_eq!(tokens.len(), 1);

        let actual = tokens.first().unwrap();
        assert_eq!(actual.span.range, range, "wrong range for: {}", raw);
        assert_eq!(&actual.value, &tv);
    }

    // String literals: plain text, escaped quote/newline, unicode escapes,
    // and escaped (discarded) whitespace.
    #[rstest]
    #[case("\"\"", "")]
    #[case("\"a\"", "a")]
    #[case("\"a\"", "a")]
    // String containing an escaped quote.
    #[case("\"a\\\"\"", "a\"")]
    // String containing an escaped newline.
    #[case("\"a\\nb\"", "a\nb")]
    #[case("\"\\u{61}bc\"", "abc")]
    // String containing escaped whitespace.
    #[case("\"a\\          \\nb\"", "a\nb")]
    fn lit_str(#[case] raw: &str, #[case] expected: &str) {
        let (tokens, errors) = tokenize(raw);
        assert!(errors.is_empty());
        assert_eq!(tokens.len(), 1);

        let actual = tokens.first().unwrap();
        assert_eq!(&actual.value, &TokenValue::LitStr(expected.to_string()));
    }

    // A missing closing quote still yields the string token, plus a reported
    // error anchored at end of input.
    #[test]
    fn unclosed_str_lit() {
        let (tokens, errors) = tokenize("\"abc");
        assert_eq!(tokens.len(), 1);

        let actual = tokens.first().unwrap();
        assert_eq!(&actual.value, &TokenValue::LitStr("abc".to_string()));

        let LexerError(err_range, err_msg) = errors.first().unwrap();
        assert_eq!(err_range, &(4..4));
        assert_eq!(err_msg, &"expected closing quote");
    }

    // A unicode escape missing its closing brace recovers (the hex digits
    // still decode) and reports an error at the offending position.
    #[test]
    fn invalid_unicode_escape() {
        let (tokens, errors) = tokenize("\"\\u{61\"");
        assert_eq!(tokens.len(), 1);

        let actual = tokens.first().unwrap();
        assert_eq!(&actual.value, &TokenValue::LitStr("a".to_string()));

        let LexerError(err_range, err_msg) = errors.first().unwrap();
        assert_eq!(err_range, &(6..7));
        assert_eq!(err_msg, &"expected closing brace");
    }

    // Numeric literals classify as u64 / i64 / f64 as narrowly as possible;
    // values too large for 64-bit integers fall back to f64.
    #[rstest]
    #[case("123", true, true, false)]
    #[case("-12", false, true, false)]
    #[case("0.12", false, false, true)]
    #[case("-0.1", false, false, true)]
    // 2^65 - too large to fit in u64
    #[case("36893488147419103232", false, false, true)]
    // -2^65 - again, too large to fit in an i64
    #[case("-36893488147419103232", false, false, true)]
    fn lit_num(
        #[case] raw: &str,
        #[case] is_u64: bool,
        #[case] is_i64: bool,
        #[case] is_f64: bool,
    ) {
        let (tokens, errors) = tokenize(raw);
        assert_eq!(tokens.len(), 1);
        assert!(errors.is_empty());

        let actual = tokens.first().unwrap();
        match &actual.value {
            TokenValue::LitNum(v) => {
                assert_eq!(v.is_u64(), is_u64);
                assert_eq!(v.is_i64(), is_i64);
                assert_eq!(v.is_f64(), is_f64);
            }
            _ => panic!("token value must be a numeric literal"),
        }
    }

    // End-to-end: a small expression lexes into the expected token sequence.
    #[test]
    fn simple() {
        let (tokens, errors) = tokenize("a(b: true)::{c, d}");
        assert!(errors.is_empty());

        let token_values = [
            TokenValue::Ident("a".to_string()),
            TokenValue::LParen,
            TokenValue::Ident("b".to_string()),
            TokenValue::Colon,
            TokenValue::Whitespace,
            TokenValue::LitBool(true),
            TokenValue::RParen,
            TokenValue::Scope,
            TokenValue::LCurly,
            TokenValue::Ident("c".to_string()),
            TokenValue::Comma,
            TokenValue::Whitespace,
            TokenValue::Ident("d".to_string()),
            TokenValue::RCurly,
        ];

        assert_eq!(
            tokens.into_iter().map(|t| t.value).collect::<Vec<_>>(),
            token_values.to_vec(),
        );
    }
}
365}