sixtyfps_compilerlib/
lexer.rs

1// Copyright © SixtyFPS GmbH <info@sixtyfps.io>
2// SPDX-License-Identifier: (GPL-3.0-only OR LicenseRef-SixtyFPS-commercial)
3
4//! This module contains the code for the lexer.
5//!
//! It is kind of shared with parser.rs, which implements `lex_next_token` based on the
//! `macro_rules` macro that declares the tokens.
8
/// Mutable state that is threaded through the lexing rules from one token to the next.
///
/// The default value is the state at the start of a source text: no string template is open.
#[derive(Debug, Default)]
pub struct LexState {
    /// One entry per string template whose `\{` has been lexed and not yet closed
    /// (`lex_string` pushes a `0` when it sees `\{`).
    ///
    /// The top of the stack is the level of embedded braces `{` currently open inside the
    /// template expression: that many `}` must still be lexed as ordinary tokens before the
    /// next `}` re-enters string mode and pops the stack.
    template_string_stack: Vec<u32>,
}
15
/// A single lexing rule.
///
/// This trait is used by the `crate::parser::lex_next_token` function and is implemented
/// for the rules passed to the macro, which can be either a string literal or a function
/// of type `Fn(&str, &mut LexState) -> usize`.
pub trait LexingRule {
    /// Return the size (in bytes) of the match for this rule at the start of `text`,
    /// or 0 if there is no match.
    ///
    /// `state` is the lexer state shared between rules; a rule may read and update it.
    fn lex(&self, text: &str, state: &mut LexState) -> usize;
}
22
23impl<'a> LexingRule for &'a str {
24    #[inline]
25    fn lex(&self, text: &str, _: &mut LexState) -> usize {
26        if text.starts_with(*self) {
27            self.len()
28        } else {
29            0
30        }
31    }
32}
33
34impl<F: Fn(&str, &mut LexState) -> usize> LexingRule for F {
35    #[inline]
36    fn lex(&self, text: &str, state: &mut LexState) -> usize {
37        (self)(text, state)
38    }
39}
40
41pub fn lex_whitespace(text: &str, _: &mut LexState) -> usize {
42    let mut len = 0;
43    let chars = text.chars();
44    for c in chars {
45        if !c.is_whitespace() {
46            break;
47        }
48        len += c.len_utf8();
49    }
50    len
51}
52
53pub fn lex_comment(text: &str, _: &mut LexState) -> usize {
54    // FIXME: could report proper error if not properly terminated
55    if text.starts_with("//") {
56        return text.find(&['\n', '\r'] as &[_]).unwrap_or(text.len());
57    }
58    if text.starts_with("/*") {
59        let mut nested = 0;
60        let mut offset = 2;
61        let bytes = text.as_bytes();
62        while offset < bytes.len() {
63            if let Some(star) = bytes[offset..].iter().position(|c| *c == b'*') {
64                let star = star + offset;
65                if star > offset && bytes[star - 1] == b'/' {
66                    nested += 1;
67                    offset = star + 1;
68                } else if star < bytes.len() - 1 && bytes[star + 1] == b'/' {
69                    if nested == 0 {
70                        return star + 2;
71                    }
72                    nested -= 1;
73                    offset = star + 2;
74                } else {
75                    offset = star + 1;
76                }
77            } else {
78                // Unterminated
79                return 0;
80            }
81        }
82        // Unterminated
83        return 0;
84    }
85
86    0
87}
88
89pub fn lex_string(text: &str, state: &mut LexState) -> usize {
90    if let Some(brace_level) = state.template_string_stack.last_mut() {
91        if text.starts_with('{') {
92            *brace_level += 1;
93            return 0;
94        } else if text.starts_with('}') {
95            if *brace_level > 0 {
96                *brace_level -= 1;
97                return 0;
98            } else {
99                state.template_string_stack.pop();
100            }
101        } else if !text.starts_with('"') {
102            return 0;
103        }
104    } else if !text.starts_with('"') {
105        return 0;
106    }
107    let text_len = text.as_bytes().len();
108    let mut end = 1; // skip the '"'
109    loop {
110        let stop = match text[end..].find(&['"', '\\'][..]) {
111            Some(stop) => end + stop,
112            // FIXME: report an error for unterminated string
113            None => return 0,
114        };
115        match text.as_bytes()[stop] {
116            b'"' => {
117                return stop + 1;
118            }
119            b'\\' => {
120                if text_len <= stop + 1 {
121                    // FIXME: report an error for unterminated string
122                    return 0;
123                }
124                if text.as_bytes()[stop + 1] == b'{' {
125                    state.template_string_stack.push(0);
126                    return stop + 2;
127                }
128                end = stop + 1 + text[stop + 1..].chars().next().map_or(0, |c| c.len_utf8())
129            }
130            _ => unreachable!(),
131        }
132    }
133}
134
135pub fn lex_number(text: &str, _: &mut LexState) -> usize {
136    let mut len = 0;
137    let mut chars = text.chars();
138    let mut had_period = false;
139    while let Some(c) = chars.next() {
140        if !c.is_ascii_digit() {
141            if !had_period && c == '.' && len > 0 {
142                had_period = true;
143            } else {
144                if len > 0 {
145                    if c == '%' {
146                        return len + 1;
147                    }
148                    if c.is_ascii_alphabetic() {
149                        len += c.len_utf8();
150                        // The unit
151                        for c in chars {
152                            if !c.is_ascii_alphabetic() {
153                                return len;
154                            }
155                            len += c.len_utf8();
156                        }
157                    }
158                }
159                break;
160            }
161        }
162        len += c.len_utf8();
163    }
164    len
165}
166
167pub fn lex_color(text: &str, _: &mut LexState) -> usize {
168    if !text.starts_with('#') {
169        return 0;
170    }
171    let mut len = 1;
172    let chars = text[1..].chars();
173    for c in chars {
174        if !c.is_ascii_alphanumeric() {
175            break;
176        }
177        len += c.len_utf8();
178    }
179    len
180}
181
182pub fn lex_identifier(text: &str, _: &mut LexState) -> usize {
183    let mut len = 0;
184    let chars = text.chars();
185    for c in chars {
186        if !c.is_alphanumeric() && c != '_' && (c != '-' || len == 0) {
187            break;
188        }
189        len += c.len_utf8();
190    }
191    len
192}
193
194#[allow(clippy::needless_update)] // Token may have extra fields depending on selected features
195pub fn lex(mut source: &str) -> Vec<crate::parser::Token> {
196    let mut result = vec![];
197    let mut offset = 0;
198    let mut state = LexState::default();
199    while !source.is_empty() {
200        if let Some((len, kind)) = crate::parser::lex_next_token(source, &mut state) {
201            result.push(crate::parser::Token {
202                kind,
203                text: source[..len].into(),
204                offset,
205                ..Default::default()
206            });
207            offset += len;
208            source = &source[len..];
209        } else {
210            // FIXME: recover
211            result.push(crate::parser::Token {
212                kind: crate::parser::SyntaxKind::Error,
213                text: source.into(),
214                offset,
215                ..Default::default()
216            });
217            //offset += source.len();
218            break;
219        }
220    }
221    result
222}
223
/// Checks the token kinds and token texts produced by `lex` for a set of small inputs.
#[test]
fn basic_lexer_test() {
    /// Lex `source` and assert that the resulting (kind, text) pairs equal `expected`.
    fn compare(source: &str, expected: &[(crate::parser::SyntaxKind, &str)]) {
        let actual = lex(source);
        let actual =
            actual.iter().map(|token| (token.kind, token.text.as_str())).collect::<Vec<_>>();
        assert_eq!(actual.as_slice(), expected);
    }

    // Number, whitespace, a block comment containing a nested comment, and a string.
    compare(
        r#"45  /*hi/*_*/ho*/ "string""#,
        &[
            (crate::parser::SyntaxKind::NumberLiteral, "45"),
            (crate::parser::SyntaxKind::Whitespace, "  "),
            (crate::parser::SyntaxKind::Comment, "/*hi/*_*/ho*/"),
            (crate::parser::SyntaxKind::Whitespace, " "),
            (crate::parser::SyntaxKind::StringLiteral, r#""string""#),
        ],
    );

    // Numbers with a unit, a decimal period, and a percent sign, separated by operators.
    compare(
        r#"12px+5.2+=0.7%"#,
        &[
            (crate::parser::SyntaxKind::NumberLiteral, "12px"),
            (crate::parser::SyntaxKind::Plus, "+"),
            (crate::parser::SyntaxKind::NumberLiteral, "5.2"),
            (crate::parser::SyntaxKind::PlusEqual, "+="),
            (crate::parser::SyntaxKind::NumberLiteral, "0.7%"),
        ],
    );
    // Identifiers with underscores and digits, separated by punctuation.
    compare(
        r#"aa_a.b1,c"#,
        &[
            (crate::parser::SyntaxKind::Identifier, "aa_a"),
            (crate::parser::SyntaxKind::Dot, "."),
            (crate::parser::SyntaxKind::Identifier, "b1"),
            (crate::parser::SyntaxKind::Comma, ","),
            (crate::parser::SyntaxKind::Identifier, "c"),
        ],
    );
    // Adjacent and nested block comments followed by a lone star.
    compare(
        r#"/*/**/*//**/*"#,
        &[
            (crate::parser::SyntaxKind::Comment, "/*/**/*/"),
            (crate::parser::SyntaxKind::Comment, "/**/"),
            (crate::parser::SyntaxKind::Star, "*"),
        ],
    );
    // Line comments stop before `\n` and before `\r\n`, and may end at the end of input.
    compare(
        "a//x\nb//y\r\nc//z",
        &[
            (crate::parser::SyntaxKind::Identifier, "a"),
            (crate::parser::SyntaxKind::Comment, "//x"),
            (crate::parser::SyntaxKind::Whitespace, "\n"),
            (crate::parser::SyntaxKind::Identifier, "b"),
            (crate::parser::SyntaxKind::Comment, "//y"),
            (crate::parser::SyntaxKind::Whitespace, "\r\n"),
            (crate::parser::SyntaxKind::Identifier, "c"),
            (crate::parser::SyntaxKind::Comment, "//z"),
        ],
    );
    // A minimal string literal.
    compare(r#""x""#, &[(crate::parser::SyntaxKind::StringLiteral, r#""x""#)]);
    // Escaped quote and escaped backslash inside a string do not end it.
    compare(
        r#"a"\"\\"x"#,
        &[
            (crate::parser::SyntaxKind::Identifier, "a"),
            (crate::parser::SyntaxKind::StringLiteral, r#""\"\\""#),
            (crate::parser::SyntaxKind::Identifier, "x"),
        ],
    );
    // String templates: `\{` ends a string piece, braces inside the expression are plain
    // tokens, and the `}` closing the expression starts the next string piece. Templates
    // can be nested inside template expressions.
    compare(
        r#""a\{b{c}d"e\{f}g"h}i"j"#,
        &[
            (crate::parser::SyntaxKind::StringLiteral, r#""a\{"#),
            (crate::parser::SyntaxKind::Identifier, "b"),
            (crate::parser::SyntaxKind::LBrace, "{"),
            (crate::parser::SyntaxKind::Identifier, "c"),
            (crate::parser::SyntaxKind::RBrace, "}"),
            (crate::parser::SyntaxKind::Identifier, "d"),
            (crate::parser::SyntaxKind::StringLiteral, r#""e\{"#),
            (crate::parser::SyntaxKind::Identifier, "f"),
            (crate::parser::SyntaxKind::StringLiteral, r#"}g""#),
            (crate::parser::SyntaxKind::Identifier, "h"),
            (crate::parser::SyntaxKind::StringLiteral, r#"}i""#),
            (crate::parser::SyntaxKind::Identifier, "j"),
        ],
    );

    // Fuzzer tests:
    // An unterminated block comment is not a comment; its characters are lexed one by one.
    compare(
        r#"/**"#,
        &[
            (crate::parser::SyntaxKind::Div, "/"),
            (crate::parser::SyntaxKind::Star, "*"),
            (crate::parser::SyntaxKind::Star, "*"),
        ],
    );
    // An unterminated string ending in a backslash yields a single Error token.
    compare(r#""\"#, &[(crate::parser::SyntaxKind::Error, "\"\\")]);
    // Same when the backslash is followed by a multi-byte character.
    compare(r#""\ޱ"#, &[(crate::parser::SyntaxKind::Error, "\"\\ޱ")]);
}