i_slint_compiler/lexer.rs

// Copyright © SixtyFPS GmbH <info@slint.dev>
// SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-Slint-Royalty-free-2.0 OR LicenseRef-Slint-Software-3.0

//! This module contains the code for the lexer.
//!
//! It is shared with parser.rs, which implements `lex_next_token` based on the
//! `macro_rules!` that declare the tokens.

use crate::parser::SyntaxKind;

#[derive(Default)]
pub struct LexState {
    /// The top of the stack is the number of braces `{` currently open inside the
    /// embedded expression of a template string. That many `}` must still be lexed
    /// before the stack is popped and lexing re-enters string mode.
    template_string_stack: Vec<u32>,
}

/// This trait is used by the `crate::parser::lex_next_token` function and is implemented
/// for the rules passed to the macro, which can be either string literals or functions.
pub trait LexingRule {
    /// Return the length of the match for this rule, or 0 if there is no match.
    fn lex(&self, text: &str, state: &mut LexState) -> usize;
}

impl LexingRule for &str {
    #[inline]
    fn lex(&self, text: &str, _: &mut LexState) -> usize {
        if text.starts_with(*self) {
            self.len()
        } else {
            0
        }
    }
}

impl<F: Fn(&str, &mut LexState) -> usize> LexingRule for F {
    #[inline]
    fn lex(&self, text: &str, state: &mut LexState) -> usize {
        (self)(text, state)
    }
}

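// A minimal illustrative test of the two rule forms (this example test is an
// addition for exposition, not part of the upstream test suite).
#[test]
fn lexing_rule_examples() {
    let mut state = LexState::default();
    // A `&str` rule matches itself as a prefix and reports its own length.
    assert_eq!("fn".lex("fn main", &mut state), 2);
    assert_eq!("fn".lex("main", &mut state), 0);
    // Any `Fn(&str, &mut LexState) -> usize` works as a rule, too.
    fn digits(text: &str, _: &mut LexState) -> usize {
        text.bytes().take_while(u8::is_ascii_digit).count()
    }
    assert_eq!(digits.lex("42px", &mut state), 2);
}
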
pub fn lex_whitespace(text: &str, _: &mut LexState) -> usize {
    let mut len = 0;
    let chars = text.chars();
    for c in chars {
        // The Start-of-Text/End-of-Text markers inserted by `extract_rust_macro`
        // also count as whitespace.
        if !c.is_whitespace() && !['\u{0002}', '\u{0003}'].contains(&c) {
            break;
        }
        len += c.len_utf8();
    }
    len
}

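// Illustrative example (an addition for exposition): the \u{2}/\u{3} markers
// produced by `extract_rust_macro` below are treated as whitespace.
#[test]
fn lex_whitespace_examples() {
    let mut state = LexState::default();
    assert_eq!(lex_whitespace("  \t\nx", &mut state), 4);
    assert_eq!(lex_whitespace("\u{2} \u{3}rest", &mut state), 3);
    assert_eq!(lex_whitespace("x ", &mut state), 0);
}
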
pub fn lex_comment(text: &str, _: &mut LexState) -> usize {
    // FIXME: could report a proper error if not properly terminated
    if text.starts_with("//") {
        return text.find(&['\n', '\r'] as &[_]).unwrap_or(text.len());
    }
    if text.starts_with("/*") {
        let mut nested = 0;
        let mut offset = 2;
        let bytes = text.as_bytes();
        while offset < bytes.len() {
            if let Some(star) = bytes[offset..].iter().position(|c| *c == b'*') {
                let star = star + offset;
                if star > offset && bytes[star - 1] == b'/' {
                    // `/*`: one more level of nesting
                    nested += 1;
                    offset = star + 1;
                } else if star < bytes.len() - 1 && bytes[star + 1] == b'/' {
                    // `*/`: closes the innermost comment
                    if nested == 0 {
                        return star + 2;
                    }
                    nested -= 1;
                    offset = star + 2;
                } else {
                    offset = star + 1;
                }
            } else {
                // Unterminated
                return 0;
            }
        }
        // Unterminated
        return 0;
    }

    0
}

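// Illustrative example (an addition for exposition): line comments stop before
// the newline, block comments nest, and unterminated block comments don't match.
#[test]
fn lex_comment_examples() {
    let mut state = LexState::default();
    assert_eq!(lex_comment("// hi\nnext", &mut state), 5);
    assert_eq!(lex_comment("/* a /* nested */ b */x", &mut state), 22);
    assert_eq!(lex_comment("/* unterminated", &mut state), 0);
}
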
pub fn lex_string(text: &str, state: &mut LexState) -> usize {
    if let Some(brace_level) = state.template_string_stack.last_mut() {
        if text.starts_with('{') {
            // Not part of a string: just track the brace nesting so we know
            // when the embedded expression ends.
            *brace_level += 1;
            return 0;
        } else if text.starts_with('}') {
            if *brace_level > 0 {
                *brace_level -= 1;
                return 0;
            } else {
                // This `}` ends the embedded expression: resume string mode.
                state.template_string_stack.pop();
            }
        } else if !text.starts_with('"') {
            return 0;
        }
    } else if !text.starts_with('"') {
        return 0;
    }
    let text_len = text.as_bytes().len();
    let mut end = 1; // skip the opening '"' (or the '}' that resumed string mode)
    loop {
        let stop = match text[end..].find(&['"', '\\'][..]) {
            Some(stop) => end + stop,
            // FIXME: report an error for unterminated strings
            None => return 0,
        };
        match text.as_bytes()[stop] {
            b'"' => {
                return stop + 1;
            }
            b'\\' => {
                if text_len <= stop + 1 {
                    // FIXME: report an error for unterminated strings
                    return 0;
                }
                if text.as_bytes()[stop + 1] == b'{' {
                    // `\{` starts an embedded expression: switch to expression mode.
                    state.template_string_stack.push(0);
                    return stop + 2;
                }
                // Skip over the escaped character.
                end = stop + 1 + text[stop + 1..].chars().next().map_or(0, |c| c.len_utf8())
            }
            _ => unreachable!(),
        }
    }
}

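// Illustrative example (an addition for exposition): `\{` ends a string
// fragment and pushes onto the template-string stack; once the embedded
// expression is fully lexed, the closing `}` resumes string mode.
#[test]
fn lex_string_examples() {
    let mut state = LexState::default();
    assert_eq!(lex_string(r#""plain" rest"#, &mut state), 7);
    // The first fragment of a template string ends right after `\{`...
    assert_eq!(lex_string(r#""a\{b}c" more"#, &mut state), 4);
    assert_eq!(state.template_string_stack.len(), 1);
    // ...and after the expression (`b` here) is consumed, `}c"` is the next fragment.
    assert_eq!(lex_string(r#"}c" more"#, &mut state), 3);
    assert!(state.template_string_stack.is_empty());
}
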
pub fn lex_number(text: &str, _: &mut LexState) -> usize {
    let mut len = 0;
    let mut chars = text.chars();
    let mut had_period = false;
    while let Some(c) = chars.next() {
        if !c.is_ascii_digit() {
            if !had_period && c == '.' && len > 0 {
                had_period = true;
            } else {
                if len > 0 {
                    if c == '%' {
                        return len + 1;
                    }
                    if c.is_ascii_alphabetic() {
                        len += c.len_utf8();
                        // The unit
                        for c in chars {
                            if !c.is_ascii_alphabetic() {
                                return len;
                            }
                            len += c.len_utf8();
                        }
                    }
                }
                break;
            }
        }
        len += c.len_utf8();
    }
    len
}

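// Illustrative example (an addition for exposition): a number may carry an
// alphabetic unit suffix or `%`, and contains at most one decimal point.
#[test]
fn lex_number_examples() {
    let mut state = LexState::default();
    assert_eq!(lex_number("12px+", &mut state), 4);
    assert_eq!(lex_number("0.7%;", &mut state), 4);
    assert_eq!(lex_number("5.2.3", &mut state), 3); // second '.' ends the number
    assert_eq!(lex_number(".5", &mut state), 0); // must start with a digit
}
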
pub fn lex_color(text: &str, _: &mut LexState) -> usize {
    if !text.starts_with('#') {
        return 0;
    }
    let mut len = 1;
    let chars = text[1..].chars();
    for c in chars {
        if !c.is_ascii_alphanumeric() {
            break;
        }
        len += c.len_utf8();
    }
    len
}

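// Illustrative example (an addition for exposition): the lexer accepts any run
// of ASCII alphanumerics after `#`; validity is checked later.
#[test]
fn lex_color_examples() {
    let mut state = LexState::default();
    assert_eq!(lex_color("#ff00ff;", &mut state), 7);
    assert_eq!(lex_color("#abc", &mut state), 4);
    assert_eq!(lex_color("ff00ff", &mut state), 0);
}
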
pub fn lex_identifier(text: &str, _: &mut LexState) -> usize {
    let mut len = 0;
    let chars = text.chars();
    for c in chars {
        if !c.is_alphanumeric() && c != '_' && (c != '-' || len == 0) {
            break;
        }
        len += c.len_utf8();
    }
    len
}

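// Illustrative example (an addition for exposition): `-` is allowed anywhere
// but the first character, matching Slint's dashed identifiers.
#[test]
fn lex_identifier_examples() {
    let mut state = LexState::default();
    assert_eq!(lex_identifier("foo-bar baz", &mut state), 7);
    assert_eq!(lex_identifier("_a1.b", &mut state), 3);
    assert_eq!(lex_identifier("-x", &mut state), 0);
}
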
#[allow(clippy::needless_update)] // Token may have extra fields depending on selected features
pub fn lex(mut source: &str) -> Vec<crate::parser::Token> {
    let mut result = vec![];
    let mut offset = 0;
    let mut state = LexState::default();
    if source.starts_with("\u{FEFF}") {
        // Skip BOM
        result.push(crate::parser::Token {
            kind: SyntaxKind::Whitespace,
            text: source[..3].into(),
            offset: 0,
            ..Default::default()
        });
        source = &source[3..];
        offset += 3;
    }
    while !source.is_empty() {
        if let Some((len, kind)) = crate::parser::lex_next_token(source, &mut state) {
            result.push(crate::parser::Token {
                kind,
                text: source[..len].into(),
                offset,
                ..Default::default()
            });
            offset += len;
            source = &source[len..];
        } else {
            // FIXME: recover
            result.push(crate::parser::Token {
                kind: SyntaxKind::Error,
                text: source.into(),
                offset,
                ..Default::default()
            });
            //offset += source.len();
            break;
        }
    }
    result
}

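// Illustrative example (an addition for exposition): a leading UTF-8 BOM is
// emitted as a 3-byte whitespace token so subsequent offsets stay byte-accurate.
#[test]
fn lex_bom_example() {
    let tokens = lex("\u{FEFF}a");
    assert_eq!(tokens[0].kind, SyntaxKind::Whitespace);
    assert_eq!(tokens[1].kind, SyntaxKind::Identifier);
    assert_eq!(tokens[1].offset, 3);
}
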
#[test]
fn basic_lexer_test() {
    fn compare(source: &str, expected: &[(SyntaxKind, &str)]) {
        let actual = lex(source);
        let actual =
            actual.iter().map(|token| (token.kind, token.text.as_str())).collect::<Vec<_>>();
        assert_eq!(actual.as_slice(), expected);
    }

    compare(
        r#"45  /*hi/*_*/ho*/ "string""#,
        &[
            (SyntaxKind::NumberLiteral, "45"),
            (SyntaxKind::Whitespace, "  "),
            (SyntaxKind::Comment, "/*hi/*_*/ho*/"),
            (SyntaxKind::Whitespace, " "),
            (SyntaxKind::StringLiteral, r#""string""#),
        ],
    );

    compare(
        r#"12px+5.2+=0.7%"#,
        &[
            (SyntaxKind::NumberLiteral, "12px"),
            (SyntaxKind::Plus, "+"),
            (SyntaxKind::NumberLiteral, "5.2"),
            (SyntaxKind::PlusEqual, "+="),
            (SyntaxKind::NumberLiteral, "0.7%"),
        ],
    );
    compare(
        r#"aa_a.b1,c"#,
        &[
            (SyntaxKind::Identifier, "aa_a"),
            (SyntaxKind::Dot, "."),
            (SyntaxKind::Identifier, "b1"),
            (SyntaxKind::Comma, ","),
            (SyntaxKind::Identifier, "c"),
        ],
    );
    compare(
        r#"/*/**/*//**/*"#,
        &[
            (SyntaxKind::Comment, "/*/**/*/"),
            (SyntaxKind::Comment, "/**/"),
            (SyntaxKind::Star, "*"),
        ],
    );
    compare(
        "a//x\nb//y\r\nc//z",
        &[
            (SyntaxKind::Identifier, "a"),
            (SyntaxKind::Comment, "//x"),
            (SyntaxKind::Whitespace, "\n"),
            (SyntaxKind::Identifier, "b"),
            (SyntaxKind::Comment, "//y"),
            (SyntaxKind::Whitespace, "\r\n"),
            (SyntaxKind::Identifier, "c"),
            (SyntaxKind::Comment, "//z"),
        ],
    );
    compare(r#""x""#, &[(SyntaxKind::StringLiteral, r#""x""#)]);
    compare(
        r#"a"\"\\"x"#,
        &[
            (SyntaxKind::Identifier, "a"),
            (SyntaxKind::StringLiteral, r#""\"\\""#),
            (SyntaxKind::Identifier, "x"),
        ],
    );
    compare(
        r#""a\{b{c}d"e\{f}g"h}i"j"#,
        &[
            (SyntaxKind::StringLiteral, r#""a\{"#),
            (SyntaxKind::Identifier, "b"),
            (SyntaxKind::LBrace, "{"),
            (SyntaxKind::Identifier, "c"),
            (SyntaxKind::RBrace, "}"),
            (SyntaxKind::Identifier, "d"),
            (SyntaxKind::StringLiteral, r#""e\{"#),
            (SyntaxKind::Identifier, "f"),
            (SyntaxKind::StringLiteral, r#"}g""#),
            (SyntaxKind::Identifier, "h"),
            (SyntaxKind::StringLiteral, r#"}i""#),
            (SyntaxKind::Identifier, "j"),
        ],
    );

    // Fuzzer tests:
    compare(r#"/**"#, &[(SyntaxKind::Div, "/"), (SyntaxKind::Star, "*"), (SyntaxKind::Star, "*")]);
    compare(r#""\"#, &[(SyntaxKind::Error, "\"\\")]);
    compare(r#""\ޱ"#, &[(SyntaxKind::Error, "\"\\ޱ")]);
}

/// Given the source of a Rust file, find each occurrence of the `slint!(...)` macro.
/// Return an iterator over the range of each macro's contents within the original source.
pub fn locate_slint_macro(rust_source: &str) -> impl Iterator<Item = core::ops::Range<usize>> + '_ {
    let mut begin = 0;
    std::iter::from_fn(move || {
        let (open, close) = loop {
            if let Some(m) = rust_source[begin..].find("slint") {
                // Heuristic to detect whether we are inside a comment or a string literal:
                // not perfect, but it should work in most cases.
                if let Some(x) = rust_source[begin..(begin + m)].rfind(['\\', '\n', '/', '\"']) {
                    if rust_source.as_bytes()[begin + x] != b'\n' {
                        // Probably in a comment or a string literal: skip to the next line.
                        begin += m + 5;
                        begin += rust_source[begin..].find(['\n']).unwrap_or(0);
                        continue;
                    }
                }
                begin += m + 5; // skip over "slint"
                while rust_source[begin..].starts_with(' ') {
                    begin += 1;
                }
                if !rust_source[begin..].starts_with('!') {
                    continue;
                }
                begin += 1;
                while rust_source[begin..].starts_with(' ') {
                    begin += 1;
                }
                let Some(open) = rust_source.as_bytes().get(begin) else { continue };
                match open {
                    b'{' => break (SyntaxKind::LBrace, SyntaxKind::RBrace),
                    b'[' => break (SyntaxKind::LBracket, SyntaxKind::RBracket),
                    b'(' => break (SyntaxKind::LParent, SyntaxKind::RParent),
                    _ => continue,
                }
            } else {
                // No macro found, just return
                return None;
            }
        };

        begin += 1;

        // Now find the matching closing delimiter.
        // Technically, we should be lexing Rust here, not Slint.
        let mut state = LexState::default();
        let start = begin;
        let mut end = begin;
        let mut level = 0;
        while !rust_source[end..].is_empty() {
            let len = match crate::parser::lex_next_token(&rust_source[end..], &mut state) {
                Some((len, x)) if x == open => {
                    level += 1;
                    len
                }
                Some((_, x)) if x == close && level == 0 => {
                    break;
                }
                Some((len, x)) if x == close => {
                    level -= 1;
                    len
                }
                Some((len, _)) => len,
                None => {
                    // Lex error
                    break;
                }
            };
            if len == 0 {
                break; // Shouldn't happen
            }
            end += len;
        }
        begin = end;
        Some(start..end)
    })
}

#[test]
fn test_locate_rust_macro() {
    #[track_caller]
    fn do_test(source: &str, captures: &[&str]) {
        let result = locate_slint_macro(source).map(|r| &source[r]).collect::<Vec<_>>();
        assert_eq!(&result, captures);
    }

    do_test("\nslint{!{}}", &[]);
    do_test(
        "//slint!(123)\nslint!(456)\nslint ![789]\n/*slint!{abc}*/\nslint! {def}",
        &["456", "789", "def"],
    );
    do_test("slint!(slint!(abc))slint!()", &["slint!(abc)", ""]);
}

/// Given the contents of a Rust source file, return a string containing the contents of the first `slint!` macro.
///
/// All other bytes that are not newlines are replaced by spaces. This allows offsets in the resulting
/// string to preserve line and column numbers.
///
/// The last byte before the Slint area will be \u{2} (ASCII Start-of-Text), and the first byte after
/// the Slint code will be \u{3} (ASCII End-of-Text), so that programs can locate the area of Slint code
/// within the result.
///
/// Note that the Slint compiler considers Start-of-Text and End-of-Text as whitespace and will treat them
/// accordingly.
pub fn extract_rust_macro(rust_source: String) -> Option<String> {
    let core::ops::Range { start, end } = locate_slint_macro(&rust_source).next()?;
    let mut bytes = rust_source.into_bytes();
    for c in &mut bytes[..start] {
        if *c != b'\n' {
            *c = b' '
        }
    }

    if start > 0 {
        bytes[start - 1] = 2;
    }
    if end < bytes.len() {
        bytes[end] = 3;

        for c in &mut bytes[end + 1..] {
            if *c != b'\n' {
                *c = b' '
            }
        }
    }
    Some(String::from_utf8(bytes).expect("We just added spaces"))
}

#[test]
fn test_extract_rust_macro() {
    assert_eq!(extract_rust_macro("\nslint{!{}}".into()), None);
    assert_eq!(
        extract_rust_macro(
            "abc\n€\nslint !  {x \" \\\" }🦀\" { () {}\n {} }xx =}-  ;}\n xxx \n yyy {}\n".into(),
        ),
        Some(
            "   \n   \n         \u{2}x \" \\\" }🦀\" { () {}\n {} }xx =\u{3}     \n     \n       \n".into(),
        )
    );

    assert_eq!(
        extract_rust_macro("xx\nabcd::slint!{abc{}efg".into()),
        Some("  \n            \u{2}abc{}efg".into())
    );
    assert_eq!(
        extract_rust_macro("slint!\nnot.\nslint!{\nunterminated\nxxx".into()),
        Some("      \n    \n      \u{2}\nunterminated\nxxx".into())
    );
    assert_eq!(extract_rust_macro("foo\n/* slint! { hello }\n".into()), None);
    assert_eq!(extract_rust_macro("foo\n/* slint::slint! { hello }\n".into()), None);
    assert_eq!(
        extract_rust_macro("foo\n// slint! { hello }\nslint!{world}\na".into()),
        Some("   \n                   \n      \u{2}world\u{3}\n ".into())
    );
    assert_eq!(extract_rust_macro("foo\n\" slint! { hello }\"\n".into()), None);
    assert_eq!(
        extract_rust_macro(
            "abc\n€\nslint !  (x /* \\\" )🦀*/ { () {}\n {} }xx =)-  ;}\n xxx \n yyy {}\n".into(),
        ),
        Some(
            "   \n   \n         \u{2}x /* \\\" )🦀*/ { () {}\n {} }xx =\u{3}     \n     \n       \n".into(),
        )
    );
    assert_eq!(
        extract_rust_macro("abc slint![x slint!() [{[]}] s] abc".into()),
        Some("          \u{0002}x slint!() [{[]}] s\u{0003}    ".into()),
    );
}