gq_core/lexer.rs

use core::fmt;
use std::fmt::{Display, Formatter};

use logos::Logos;
use thiserror::Error;

#[derive(Error, Clone, Default, Debug, PartialEq)]
pub enum Error {
    #[default]
    #[error("Unknown character")]
    UnknownCharacter,
}

#[derive(Clone, Debug, Logos, PartialEq)]
#[logos(skip r"[ \t\r\n\f]+")]
#[logos(error = Error)]
pub enum Token {
    #[token("{")]
    LBrace,
    #[token("}")]
    RBrace,
    #[token("(")]
    LParen,
    #[token(")")]
    RParen,
    #[token(".")]
    Dot,
    #[token(":")]
    Colon,
    #[token(",")]
    Comma,
    #[token("=")]
    Equal,
    #[token("!=")]
    NotEqual,
    #[token(">")]
    Greater,
    #[token(">=")]
    GreaterEqual,
    #[token("<")]
    Less,
    #[token("<=")]
    LessEqual,
    #[token("~")]
    Tilde,
    #[token("!~")]
    NotTilde,
    // This regex does not support keys starting with '-' or numbers
    #[regex(r"[a-zA-Z_][\w-]*", |lex| lex.slice().to_string())]
    Identifier(String),
    // Values
    #[token("false", |_| false)]
    #[token("true", |_| true)]
    Bool(bool),
    // Regex taken from https://logos.maciej.codes/examples/json.html, without stopping to fully understand it
    // TODO: is the unwrap OK here? Anything matched by this regex should parse as a valid f64
    #[regex(r"-?(?:0|[1-9]\d*)(?:\.\d+)?(?:[eE][+-]?\d+)?", |lex| lex.slice().parse::<f64>().unwrap())]
    Number(f64),
    // This string follows [RFC 8259](https://datatracker.ietf.org/doc/html/rfc8259)
    // Single-quoted strings are not allowed
    #[regex(r#""(?:[^"]|\\")*""#, |lex| {
        // TODO: surface the unescape error as a custom lexer error instead of panicking
        // TODO: improve the slicing?
        let target_slice = &lex.slice()[1..lex.slice().len() - 1];
        escape8259::unescape(target_slice).expect("Error while unquoting")
    })]
    String(String),
    #[token("null")]
    Null,
    EOF,
}

impl Display for Token {
    fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
        match self {
            Token::LBrace => '{'.fmt(f),
            Token::RBrace => '}'.fmt(f),
            Token::LParen => '('.fmt(f),
            Token::RParen => ')'.fmt(f),
            Token::Dot => '.'.fmt(f),
            Token::Colon => ':'.fmt(f),
            Token::Comma => ','.fmt(f),
            Token::Equal => '='.fmt(f),
            Token::NotEqual => "!=".fmt(f),
            Token::Greater => '>'.fmt(f),
            Token::GreaterEqual => ">=".fmt(f),
            Token::Less => '<'.fmt(f),
            Token::LessEqual => "<=".fmt(f),
            Token::Tilde => '~'.fmt(f),
            Token::NotTilde => "!~".fmt(f),
            Token::Identifier(key) => key.fmt(f),
            Token::Bool(b) => b.fmt(f),
            Token::Number(n) => n.fmt(f),
            Token::String(s) => s.fmt(f),
            Token::Null => "null".fmt(f),
            Token::EOF => "EOF".fmt(f),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use rstest::rstest;

    /// Gets the next token and asserts that there are no more tokens left
    fn get_next_token(input: &str) -> Token {
        let mut lexer = Token::lexer(input);
        let token = lexer
            .next()
            .expect("There should be at least one token")
            .expect("Error parsing token");
        assert_eq!(lexer.next(), None);

        token
    }

    #[rstest]
    #[case::l_brace("{", Token::LBrace)]
    #[case::r_brace("}", Token::RBrace)]
    #[case::l_paren("(", Token::LParen)]
    #[case::r_paren(")", Token::RParen)]
    #[case::dot(".", Token::Dot)]
    #[case::colon(":", Token::Colon)]
    #[case::comma(",", Token::Comma)]
    #[case::equal("=", Token::Equal)]
    #[case::not_equal("!=", Token::NotEqual)]
    #[case::greater(">", Token::Greater)]
    #[case::greater_equal(">=", Token::GreaterEqual)]
    #[case::less("<", Token::Less)]
    #[case::less_equal("<=", Token::LessEqual)]
    #[case::tilde("~", Token::Tilde)]
    #[case::not_tilde("!~", Token::NotTilde)]
    #[case::true_token("true", Token::Bool(true))]
    #[case::false_token("false", Token::Bool(false))]
    #[case::null("null", Token::Null)]
    fn simple_token_parses(#[case] input: &str, #[case] expected: Token) {
        let token = get_next_token(input);
        assert_eq!(token, expected);
    }
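
    // A hedged end-to-end sketch (an addition, not from the original suite):
    // the logos-generated `Token::lexer` yields `Result<Token, Error>` items,
    // so a small query-like input can be collected into a token sequence.
    // The input string here is illustrative, not taken from the real gq grammar.
    #[test]
    fn token_sequence_lexes() {
        let tokens: Vec<Token> = Token::lexer("user(age >= 21): { name }")
            .collect::<Result<Vec<Token>, Error>>()
            .expect("every token should lex");
        assert_eq!(
            tokens,
            vec![
                Token::Identifier("user".to_string()),
                Token::LParen,
                Token::Identifier("age".to_string()),
                Token::GreaterEqual,
                Token::Number(21.0),
                Token::RParen,
                Token::Colon,
                Token::LBrace,
                Token::Identifier("name".to_string()),
                Token::RBrace,
            ]
        );
    }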

    #[rstest]
    #[case::positive("5", 5.0)]
    #[case::negative("-5", -5.0)]
    #[case::float("5.5", 5.5)]
    #[case::negative_float("-5.5", -5.5)]
    #[case::float_with_exponent("5.5e5", 5.5e5)]
    #[case::float_with_negative_exponent("5.5e-5", 5.5e-5)]
    #[case::float_with_positive_exponent("5.5e+5", 5.5e5)]
    #[case::float_with_uppercase_exponent("5.5E5", 5.5e5)]
    #[case::float_with_uppercase_positive_exponent("5.5E+5", 5.5e5)]
    #[case::float_with_uppercase_negative_exponent("5.5E-5", 5.5e-5)]
    fn number_token_parses(#[case] input: &str, #[case] expected: f64) {
        let token = get_next_token(input);
        let expected = Token::Number(expected);
        assert_eq!(token, expected);
    }

    #[test]
    #[should_panic]
    fn number_token_parse_fails_when_wrong_decimal_separator() {
        let input = "5,5";
        // This fails because the input lexes into three tokens,
        // and get_next_token asserts that no tokens remain
        get_next_token(input);
    }
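
    // A hedged check on the `unwrap` TODO in the number regex (an addition,
    // not from the original suite): `str::parse::<f64>` saturates to infinity
    // on overflow instead of erroring, so the unwrap should not fire for any
    // input the regex accepts.
    #[test]
    fn number_token_overflow_parses_to_infinity() {
        let token = get_next_token("1e999");
        assert_eq!(token, Token::Number(f64::INFINITY));
    }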

    #[rstest]
    #[case::simple("key")]
    #[case::with_underscore("key_with_underscore")]
    #[case::with_numbers("key_with_123_numbers")]
    #[case::with_dash("key-with-dash")]
    #[case::with_dash_and_underscore("key-with-dash_and_underscore")]
    #[case::with_caps("KeyWithCaps")]
    #[case::starting_with_underscore("_key")]
    fn identifier_token_parses(#[case] input: &str) {
        let expected = Token::Identifier(input.to_string());
        let token = get_next_token(input);
        assert_eq!(token, expected);
    }
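
    // A hedged negative case (an addition, not from the original suite): per
    // the comment on the identifier regex, a leading '-' cannot start a key,
    // so the lexer errors on the dash before reaching the identifier.
    #[test]
    fn identifier_token_fails_when_starting_with_dash() {
        let mut lexer = Token::lexer("-key");
        assert_eq!(lexer.next(), Some(Err(Error::UnknownCharacter)));
    }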

    #[rstest]
    #[case::simple(r#""JavaScript""#, "JavaScript")]
    #[case::with_space(r#""Java Script""#, "Java Script")]
    #[case::with_escaped_double_quote(r#""Java\"Script""#, "Java\"Script")]
    #[case::with_single_quote(r#""Java'Script""#, "Java'Script")]
    #[case::newline(r#""Java\nScript""#, "Java\nScript")]
    #[case::tab(r#""Java\tScript""#, "Java\tScript")]
    #[case::backslash(r#""Java\\Script""#, "Java\\Script")]
    #[case::backslash_and_quote(r#""Java\\\"Script""#, "Java\\\"Script")]
    #[case::mixed(r#""/Jav\r\n\ta\\\"Scri\"pt\n""#, "/Jav\r\n\ta\\\"Scri\"pt\n")]
    fn string_token_parses(#[case] input: &str, #[case] expected: &str) {
        let expected = Token::String(expected.to_string());
        let token = get_next_token(input);
        assert_eq!(token, expected);
    }
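
    // A hedged addition (not from the original suite): RFC 8259 also defines
    // \uXXXX escapes, which escape8259::unescape should decode.
    #[test]
    fn string_token_parses_unicode_escape() {
        let token = get_next_token(r#""\u00e9""#);
        assert_eq!(token, Token::String("é".to_string()));
    }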

    // TODO: change the lexer so an `Err` variant is returned instead of panicking
    #[test]
    #[should_panic]
    fn string_token_parse_unescape_fails_when_malformed_escaped_input() {
        let input = r#""Java\\"Script""#;
        get_next_token(input);
    }

    // Single-quoted strings are not allowed
    #[test]
    #[should_panic]
    fn string_token_parse_fails_when_single_quoted() {
        let input = r#"'Java Script'"#;
        get_next_token(input);
    }
}