jsona/
syntax.rs

1//! Declaration of the syntax tokens and lexer implementation.
2
3#![allow(non_camel_case_types)]
4
5use logos::{Lexer, Logos};
6
/// Enum containing all the tokens in a syntax tree.
///
/// Variants that carry a `#[regex]` or `#[token]` attribute are produced by
/// the generated `logos` lexer. Variants without such an attribute have no
/// lexer pattern attached (presumably they are assigned by the parser when it
/// builds composite nodes -- confirm against the parser).
///
/// The enum is `#[repr(u16)]` with discriminants assigned sequentially from
/// `WHITESPACE = 0`; `Lang::kind_from_raw` asserts against `VALUE`, so
/// `VALUE` must remain the last variant.
#[derive(Logos, Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(u16)]
pub enum SyntaxKind {
    /// One or more spaces or tabs.
    #[regex(r"([ \t])+")]
    WHITESPACE = 0,

    /// One or more consecutive `\n` or `\r\n` line endings.
    #[regex(r"(\n|\r\n)+")]
    NEWLINE,

    /// Starts at `/*`; the rest of the token is consumed by `lex_comment_block`.
    #[regex(r"/\*", lex_comment_block)]
    BLOCK_COMMENT,

    /// `//` followed by everything up to (not including) the line ending.
    #[regex(r"//[^\n\r]*")]
    LINE_COMMENT,

    /// A run of ASCII letters, digits and underscores.
    #[regex(r"[A-Za-z0-9_]+", priority = 2)]
    IDENT,

    /// `@` followed by zero or more ASCII letters, digits or underscores.
    #[regex(r"@[A-Za-z0-9_]*")]
    ANNOTATION_KEY,

    /// Not part of the regular JSONA syntax, only used to allow
    /// glob patterns in keys.
    #[regex(r"[*?A-Za-z0-9_]+")]
    IDENT_WITH_GLOB,

    /// The `.` character.
    #[token(".")]
    PERIOD,

    /// The `,` character.
    #[token(",")]
    COMMA,

    /// The `:` character.
    #[token(":")]
    COLON,

    /// Starts at `'`; the rest of the token is consumed by `lex_single_quote`.
    #[regex(r#"'"#, lex_single_quote)]
    SINGLE_QUOTE,

    /// Starts at `"`; the rest of the token is consumed by `lex_double_quote`.
    #[regex(r#"""#, lex_double_quote)]
    DOUBLE_QUOTE,

    /// Starts at a backtick; the rest of the token is consumed by
    /// `lex_backtick_quote`.
    #[regex(r#"`"#, lex_backtick_quote)]
    BACKTICK_QUOTE,

    /// Optional sign followed by decimal digits and/or underscores.
    #[regex(r"[+-]?[0-9_]+", priority = 4)]
    INTEGER,

    /// `0x` followed by hexadecimal digits and/or underscores.
    #[regex(r"0x[0-9A-Fa-f_]+")]
    INTEGER_HEX,

    /// `0o` followed by octal digits and/or underscores.
    #[regex(r"0o[0-7_]+")]
    INTEGER_OCT,

    /// `0b` followed by binary digits and/or underscores.
    #[regex(r"0b(0|1|_)+")]
    INTEGER_BIN,

    /// Optionally signed decimal number with a fractional part and/or an
    /// exponent, or the literals `nan` / `inf`.
    #[regex(
        r"[-+]?((([0-9_]+)?(\.[0-9_]+)|([0-9_]+\.)([0-9_]+)?)?([eE][+-]?[0-9_]+)?|nan|inf)",
        priority = 3
    )]
    FLOAT,

    /// The literals `true` and `false`.
    #[regex(r"true|false")]
    BOOL,

    /// The literal `null`.
    #[token("null")]
    NULL,

    /// The `(` character.
    #[token("(")]
    PARENTHESES_START,

    /// The `)` character.
    #[token(")")]
    PARENTHESES_END,

    /// The `[` character.
    #[token("[")]
    BRACKET_START,

    /// The `]` character.
    #[token("]")]
    BRACKET_END,

    /// The `{` character.
    #[token("{")]
    BRACE_START,

    /// The `}` character.
    #[token("}")]
    BRACE_END,

    // composite types
    // (none of the variants below has a lexer pattern attached)
    KEY,
    SCALAR,
    PROPERTY,
    OBJECT,
    ARRAY,

    ANNOTATION_PROPERTY,
    ANNOTATION_VALUE,

    /// Marked with `#[error]` so the `logos` lexer uses it as its error token.
    #[error]
    ERROR,

    KEYS,
    ANNOTATIONS,
    // Must stay the last variant: `Lang::kind_from_raw` uses it as the upper
    // bound of valid raw discriminants.
    VALUE,
}
111
112impl SyntaxKind {
113    pub fn is_comment(self) -> bool {
114        use SyntaxKind::*;
115        matches!(self, LINE_COMMENT | BLOCK_COMMENT)
116    }
117
118    pub fn is_ws(self) -> bool {
119        use SyntaxKind::*;
120        matches!(self, WHITESPACE | NEWLINE)
121    }
122
123    pub fn is_compose(self) -> bool {
124        use SyntaxKind::*;
125        matches!(self, OBJECT | ARRAY)
126    }
127
128    pub fn is_key(self) -> bool {
129        use SyntaxKind::*;
130        matches!(
131            self,
132            IDENT
133                | IDENT_WITH_GLOB
134                | NULL
135                | BOOL
136                | INTEGER_HEX
137                | INTEGER_BIN
138                | INTEGER_OCT
139                | INTEGER
140                | SINGLE_QUOTE
141                | DOUBLE_QUOTE
142                | BACKTICK_QUOTE
143                | FLOAT
144        )
145    }
146
147    pub fn is_ws_or_comment(self) -> bool {
148        self.is_ws() || self.is_comment()
149    }
150}
151
152impl From<SyntaxKind> for rowan::SyntaxKind {
153    fn from(kind: SyntaxKind) -> Self {
154        Self(kind as u16)
155    }
156}
157
158#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
159pub enum Lang {}
160impl rowan::Language for Lang {
161    type Kind = SyntaxKind;
162    fn kind_from_raw(raw: rowan::SyntaxKind) -> Self::Kind {
163        assert!(raw.0 <= SyntaxKind::VALUE as u16);
164        unsafe { std::mem::transmute::<u16, SyntaxKind>(raw.0) }
165    }
166    fn kind_to_raw(kind: Self::Kind) -> rowan::SyntaxKind {
167        kind.into()
168    }
169}
170
/// `rowan` syntax tree node specialised to [`Lang`].
pub type SyntaxNode = rowan::SyntaxNode<Lang>;
/// `rowan` syntax tree token specialised to [`Lang`].
pub type SyntaxToken = rowan::SyntaxToken<Lang>;
/// Either a [`SyntaxNode`] or a [`SyntaxToken`].
pub type SyntaxElement = rowan::NodeOrToken<SyntaxNode, SyntaxToken>;
174
175pub fn stringify_syntax(
176    indent: usize,
177    element: SyntaxElement,
178) -> Result<String, Box<dyn std::error::Error>> {
179    let mut buf: Vec<u8> = vec![];
180    write_syntax(&mut buf, indent, element)?;
181    Ok(std::str::from_utf8(&buf)?.to_string())
182}
183
184pub fn write_syntax<T: std::io::Write>(
185    w: &mut T,
186    indent: usize,
187    element: SyntaxElement,
188) -> Result<(), Box<dyn std::error::Error>> {
189    let kind: SyntaxKind = element.kind();
190    write!(w, "{:indent$}", "", indent = indent)?;
191    match element {
192        rowan::NodeOrToken::Node(node) => {
193            writeln!(w, "{:?}@{:?}", kind, node.text_range())?;
194            for child in node.children_with_tokens() {
195                write_syntax(w, indent + 2, child)?;
196            }
197        }
198
199        rowan::NodeOrToken::Token(token) => {
200            writeln!(w, "{:?}@{:?} {:?}", kind, token.text_range(), token.text())?;
201        }
202    }
203    Ok(())
204}
205
206fn lex_comment_block(lex: &mut Lexer<SyntaxKind>) -> bool {
207    let remainder: &str = lex.remainder();
208
209    let mut asterisk_found = false;
210
211    let mut total_len = 0;
212
213    for c in remainder.chars() {
214        total_len += c.len_utf8();
215
216        if c == '*' {
217            asterisk_found = true;
218            continue;
219        }
220
221        if c == '/' && asterisk_found {
222            lex.bump(remainder[0..total_len].as_bytes().len());
223            return true;
224        }
225
226        asterisk_found = false;
227    }
228    lex.bump(remainder[0..total_len].as_bytes().len());
229    false
230}
231
232fn lex_backtick_quote(lex: &mut Lexer<SyntaxKind>) -> bool {
233    lex_string(lex, '`', true)
234}
235
236fn lex_single_quote(lex: &mut Lexer<SyntaxKind>) -> bool {
237    lex_string(lex, '\'', false)
238}
239
240fn lex_double_quote(lex: &mut Lexer<SyntaxKind>) -> bool {
241    lex_string(lex, '"', false)
242}
243
244fn lex_string(lex: &mut Lexer<SyntaxKind>, quote: char, multiline: bool) -> bool {
245    let remainder: &str = lex.remainder();
246    let mut escaped = false;
247
248    let mut total_len = 0;
249
250    for c in remainder.chars() {
251        total_len += c.len_utf8();
252
253        if c == '\\' {
254            escaped = !escaped;
255            continue;
256        }
257
258        if (c == quote && !escaped) || (c == '\n' && !multiline) {
259            lex.bump(remainder[0..total_len].as_bytes().len());
260            return true;
261        }
262
263        escaped = false;
264    }
265    lex.bump(remainder[0..total_len].as_bytes().len());
266    false
267}
268
#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `text` and returns the kind of its first token, if any.
    fn first_kind(text: &str) -> Option<SyntaxKind> {
        SyntaxKind::lexer(text).next()
    }

    /// Checks the kind of the first token for one sample of each token form.
    #[test]
    fn test_lex() {
        let cases: &[(&str, SyntaxKind)] = &[
            ("/* comment */", SyntaxKind::BLOCK_COMMENT),
            ("// comment", SyntaxKind::LINE_COMMENT),
            ("foo", SyntaxKind::IDENT),
            (r#""I'm a string\u00E9""#, SyntaxKind::DOUBLE_QUOTE),
            (r#"'Say "hello"'"#, SyntaxKind::SINGLE_QUOTE),
            (r#"`hello world`"#, SyntaxKind::BACKTICK_QUOTE),
            ("123", SyntaxKind::INTEGER),
            ("0xDEADBEEF", SyntaxKind::INTEGER_HEX),
            ("0xDE_ADBE", SyntaxKind::INTEGER_HEX),
            ("0o4567", SyntaxKind::INTEGER_OCT),
            ("0o45_67", SyntaxKind::INTEGER_OCT),
            ("0b11010110", SyntaxKind::INTEGER_BIN),
            ("0b1101_0110", SyntaxKind::INTEGER_BIN),
            ("3.14", SyntaxKind::FLOAT),
            ("-.14", SyntaxKind::FLOAT),
            ("-3.", SyntaxKind::FLOAT),
            ("true", SyntaxKind::BOOL),
            ("false", SyntaxKind::BOOL),
            ("null", SyntaxKind::NULL),
            ("api*", SyntaxKind::IDENT_WITH_GLOB),
            ("a?i*", SyntaxKind::IDENT_WITH_GLOB),
            ("*", SyntaxKind::IDENT_WITH_GLOB),
            ("**", SyntaxKind::IDENT_WITH_GLOB),
        ];

        for &(text, expected) in cases.iter() {
            assert_eq!(
                first_kind(text),
                Some(expected),
                "unexpected first token for {:?}",
                text
            );
        }
    }
}