Skip to main content

i_slint_compiler/
lexer.rs

1// Copyright © SixtyFPS GmbH <info@slint.dev>
2// SPDX-License-Identifier: GPL-3.0-only OR LicenseRef-Slint-Royalty-free-2.0 OR LicenseRef-Slint-Software-3.0
3
4//! This module contains the code for the lexer.
5//!
6//! It is kind of shared with parser.rs, which implements the lex_next_token based on the macro_rules
7//! that declares token
8
9use crate::parser::SyntaxKind;
10
/// Persistent state threaded through consecutive calls to the lexing rules.
///
/// It is only needed for template strings: when `lex_string` sees a `\{` escape it
/// pushes an entry here, so that later calls know a `}` should resume string lexing.
#[derive(Default)]
pub struct LexState {
    /// The top of the stack is the level of embedded braces `{`.
    /// So we must still lex so many '}' before re-entering into a string mode and pop the stack.
    template_string_stack: Vec<u32>,
}
17
18/// This trait is used by the `crate::parser::lex_next_token` function and is implemented
19/// for rule passed to the macro which can be either a string literal, or a function
/// This trait is used by the `crate::parser::lex_next_token` function and is implemented
/// for rule passed to the macro which can be either a string literal, or a function
pub trait LexingRule {
    /// Return the size of the match for this rule, or 0 if there is no match.
    /// The size is a byte length into `text`, not a character count.
    fn lex(&self, text: &str, state: &mut LexState) -> usize;
}
24
25impl LexingRule for &str {
26    #[inline]
27    fn lex(&self, text: &str, _: &mut LexState) -> usize {
28        if text.starts_with(*self) { self.len() } else { 0 }
29    }
30}
31
32impl<F: Fn(&str, &mut LexState) -> usize> LexingRule for F {
33    #[inline]
34    fn lex(&self, text: &str, state: &mut LexState) -> usize {
35        (self)(text, state)
36    }
37}
38
39pub fn lex_whitespace(text: &str, _: &mut LexState) -> usize {
40    let mut len = 0;
41    let chars = text.chars();
42    for c in chars {
43        if !c.is_whitespace() && !['\u{0002}', '\u{0003}'].contains(&c) {
44            break;
45        }
46        len += c.len_utf8();
47    }
48    len
49}
50
/// Lexing rule for comments: `// line comments` and `/* block comments */`,
/// where block comments may be nested.
///
/// Returns the byte length of the comment, or 0 if `text` does not start with a
/// comment or if a block comment is unterminated.
pub fn lex_comment(text: &str, _: &mut LexState) -> usize {
    // FIXME: could report proper error if not properly terminated
    if text.starts_with("//") {
        // A line comment extends up to (but not including) the next line break.
        return text.find(&['\n', '\r'] as &[_]).unwrap_or(text.len());
    }
    if text.starts_with("/*") {
        // `nested` counts "/*" openings seen beyond the initial one.
        let mut nested = 0;
        let mut offset = 2;
        let bytes = text.as_bytes();
        while offset < bytes.len() {
            // Scan for the next '*', which may belong to either "/*" or "*/".
            if let Some(star) = bytes[offset..].iter().position(|c| *c == b'*') {
                let star = star + offset;
                if star > offset && bytes[star - 1] == b'/' {
                    // "/*": one more nesting level. (`star > offset` ensures the '/'
                    // was not already consumed as part of a previous delimiter.)
                    nested += 1;
                    offset = star + 1;
                } else if star < bytes.len() - 1 && bytes[star + 1] == b'/' {
                    // "*/": closes the innermost open block comment.
                    if nested == 0 {
                        return star + 2;
                    }
                    nested -= 1;
                    offset = star + 2;
                } else {
                    // A lone '*': keep scanning after it.
                    offset = star + 1;
                }
            } else {
                // Unterminated
                return 0;
            }
        }
        // Unterminated
        return 0;
    }

    0
}
86
/// Lexing rule for string literals, including the pieces of template strings
/// that contain embedded `\{...}` expressions.
///
/// While inside an embedded expression (top of `state.template_string_stack`),
/// braces are counted so that only the `}` matching the opening `\{` resumes
/// string lexing. Returns the byte length of the (partial) string token, or 0
/// for no match or an unterminated string.
pub fn lex_string(text: &str, state: &mut LexState) -> usize {
    if let Some(brace_level) = state.template_string_stack.last_mut() {
        // We are lexing the expression part of a template string: track brace
        // nesting; the tokens themselves are matched by other rules (we return 0).
        if text.starts_with('{') {
            *brace_level += 1;
            return 0;
        } else if text.starts_with('}') {
            if *brace_level > 0 {
                *brace_level -= 1;
                return 0;
            } else {
                // This '}' closes the embedded expression: fall through and lex
                // the continuation of the template string.
                state.template_string_stack.pop();
            }
        } else if !text.starts_with('"') {
            return 0;
        }
    } else if !text.starts_with('"') {
        return 0;
    }
    let text_len = text.len();
    let mut end = 1; // skip the opening '"' (or the '}' that closed an embedded expression)
    loop {
        // Jump to the next character of interest: a closing quote or an escape.
        let stop = match text[end..].find(&['"', '\\'][..]) {
            Some(stop) => end + stop,
            // FIXME: report an error for unterminated string
            None => return 0,
        };
        match text.as_bytes()[stop] {
            b'"' => {
                return stop + 1;
            }
            b'\\' => {
                if text_len <= stop + 1 {
                    // FIXME: report an error for unterminated string
                    return 0;
                }
                if text.as_bytes()[stop + 1] == b'{' {
                    // "\{" starts an embedded expression: emit the string piece up
                    // to and including the "\{" and remember the template state.
                    state.template_string_stack.push(0);
                    return stop + 2;
                }
                // Skip over the escaped character (which may be multi-byte).
                end = stop + 1 + text[stop + 1..].chars().next().map_or(0, |c| c.len_utf8())
            }
            _ => unreachable!(),
        }
    }
}
132
133pub fn lex_number(text: &str, _: &mut LexState) -> usize {
134    let mut len = 0;
135    let mut chars = text.chars();
136    let mut had_period = false;
137    while let Some(c) = chars.next() {
138        if !c.is_ascii_digit() {
139            if !had_period && c == '.' && len > 0 {
140                had_period = true;
141            } else {
142                if len > 0 {
143                    if c == '%' {
144                        return len + 1;
145                    }
146                    if c.is_ascii_alphabetic() {
147                        len += c.len_utf8();
148                        // The unit
149                        for c in chars {
150                            if !c.is_ascii_alphabetic() {
151                                return len;
152                            }
153                            len += c.len_utf8();
154                        }
155                    }
156                }
157                break;
158            }
159        }
160        len += c.len_utf8();
161    }
162    len
163}
164
165pub fn lex_color(text: &str, _: &mut LexState) -> usize {
166    if !text.starts_with('#') {
167        return 0;
168    }
169    let mut len = 1;
170    let chars = text[1..].chars();
171    for c in chars {
172        if !c.is_ascii_alphanumeric() {
173            break;
174        }
175        len += c.len_utf8();
176    }
177    len
178}
179
180pub fn lex_identifier(text: &str, _: &mut LexState) -> usize {
181    let mut len = 0;
182    let chars = text.chars();
183    for c in chars {
184        if !c.is_alphanumeric() && c != '_' && (c != '-' || len == 0) {
185            break;
186        }
187        len += c.len_utf8();
188    }
189    len
190}
191
192#[allow(clippy::needless_update)] // Token may have extra fields depending on selected features
193pub fn lex(mut source: &str) -> Vec<crate::parser::Token> {
194    let mut result = Vec::new();
195    let mut offset = 0;
196    let mut state = LexState::default();
197    if source.starts_with("\u{FEFF}") {
198        // Skip BOM
199        result.push(crate::parser::Token {
200            kind: SyntaxKind::Whitespace,
201            text: source[..3].into(),
202            offset: 0,
203            length: 3,
204            ..Default::default()
205        });
206        source = &source[3..];
207        offset += 3;
208    }
209    while !source.is_empty() {
210        let (len, kind) = crate::parser::lex_next_token(source, &mut state).unwrap_or_else(|| {
211            // Recover from errors by returning "Error" tokens for all individual characters
212            // that the lexer could not handle.
213            //
214            // Note: Make sure to actually consume a whole character (may be more than 1 byte with
215            // UTF-8 multi-byte characters)
216            //
217            // TODO: Replace with:
218            // (source.ceil_char_boundary(1), SyntaxKind::Error)
219            // Once MSRV is 1.91
220            let length = source.chars().next().map(char::len_utf8).unwrap_or_else(|| source.len());
221            (length, SyntaxKind::Error)
222        });
223        result.push(crate::parser::Token {
224            kind,
225            text: source[..len].into(),
226            offset,
227            length: len,
228            ..Default::default()
229        });
230        offset += len;
231        source = &source[len..];
232    }
233    result
234}
235
#[test]
fn basic_lexer_test() {
    // Lex `source` and compare the resulting (kind, text) pairs with `expected`.
    fn compare(source: &str, expected: &[(SyntaxKind, &str)]) {
        let actual = lex(source);
        let actual =
            actual.iter().map(|token| (token.kind, token.text.as_str())).collect::<Vec<_>>();
        assert_eq!(actual.as_slice(), expected);
    }

    // Numbers, runs of whitespace, a nested block comment, and a plain string.
    compare(
        r#"45  /*hi/*_*/ho*/ "string""#,
        &[
            (SyntaxKind::NumberLiteral, "45"),
            (SyntaxKind::Whitespace, "  "),
            (SyntaxKind::Comment, "/*hi/*_*/ho*/"),
            (SyntaxKind::Whitespace, " "),
            (SyntaxKind::StringLiteral, r#""string""#),
        ],
    );

    // Unit suffix, decimal point, '%' suffix, and compound operators.
    compare(
        r#"12px+5.2+=0.7%"#,
        &[
            (SyntaxKind::NumberLiteral, "12px"),
            (SyntaxKind::Plus, "+"),
            (SyntaxKind::NumberLiteral, "5.2"),
            (SyntaxKind::PlusEqual, "+="),
            (SyntaxKind::NumberLiteral, "0.7%"),
        ],
    );
    // Identifiers with '_' and digits, separated by punctuation.
    compare(
        r#"aa_a.b1,c"#,
        &[
            (SyntaxKind::Identifier, "aa_a"),
            (SyntaxKind::Dot, "."),
            (SyntaxKind::Identifier, "b1"),
            (SyntaxKind::Comma, ","),
            (SyntaxKind::Identifier, "c"),
        ],
    );
    // Adjacent nested/plain block comments followed by a lone '*'.
    compare(
        r#"/*/**/*//**/*"#,
        &[
            (SyntaxKind::Comment, "/*/**/*/"),
            (SyntaxKind::Comment, "/**/"),
            (SyntaxKind::Star, "*"),
        ],
    );
    // Line comments terminated by '\n', "\r\n", and end of input.
    compare(
        "a//x\nb//y\r\nc//z",
        &[
            (SyntaxKind::Identifier, "a"),
            (SyntaxKind::Comment, "//x"),
            (SyntaxKind::Whitespace, "\n"),
            (SyntaxKind::Identifier, "b"),
            (SyntaxKind::Comment, "//y"),
            (SyntaxKind::Whitespace, "\r\n"),
            (SyntaxKind::Identifier, "c"),
            (SyntaxKind::Comment, "//z"),
        ],
    );
    compare(r#""x""#, &[(SyntaxKind::StringLiteral, r#""x""#)]);
    // Escaped quotes and backslashes inside a string.
    compare(
        r#"a"\"\\"x"#,
        &[
            (SyntaxKind::Identifier, "a"),
            (SyntaxKind::StringLiteral, r#""\"\\""#),
            (SyntaxKind::Identifier, "x"),
        ],
    );
    // Template strings: "\{" opens an embedded expression; the matching '}'
    // resumes the string, with inner braces tracked by the lexer state.
    compare(
        r#""a\{b{c}d"e\{f}g"h}i"j"#,
        &[
            (SyntaxKind::StringLiteral, r#""a\{"#),
            (SyntaxKind::Identifier, "b"),
            (SyntaxKind::LBrace, "{"),
            (SyntaxKind::Identifier, "c"),
            (SyntaxKind::RBrace, "}"),
            (SyntaxKind::Identifier, "d"),
            (SyntaxKind::StringLiteral, r#""e\{"#),
            (SyntaxKind::Identifier, "f"),
            (SyntaxKind::StringLiteral, r#"}g""#),
            (SyntaxKind::Identifier, "h"),
            (SyntaxKind::StringLiteral, r#"}i""#),
            (SyntaxKind::Identifier, "j"),
        ],
    );

    // Fuzzer tests:
    compare(r#"/**"#, &[(SyntaxKind::Div, "/"), (SyntaxKind::Star, "*"), (SyntaxKind::Star, "*")]);
    compare(r#""\"#, &[(SyntaxKind::Error, "\""), (SyntaxKind::Error, "\\")]);
    compare(
        r#""\ޱ"#,
        &[(SyntaxKind::Error, "\""), (SyntaxKind::Error, "\\"), (SyntaxKind::Identifier, "ޱ")],
    );
}
332
/// Given the source of a rust file, find the occurrence of each `slint!(...)`macro.
/// Return an iterator with the range of the location of the macro in the original source.
/// The returned range covers the macro body only, excluding the delimiters.
pub fn locate_slint_macro(rust_source: &str) -> impl Iterator<Item = core::ops::Range<usize>> + '_ {
    // Byte offset at which the next search starts; advances across iterator calls.
    let mut begin = 0;
    std::iter::from_fn(move || {
        // Find the opening delimiter of the next `slint!` invocation, remembering
        // which closing delimiter kind to look for.
        let (open, close) = loop {
            if let Some(m) = rust_source[begin..].find("slint") {
                // heuristics to find if we are not in a comment or a string literal. Not perfect, but should work in most cases
                if let Some(x) = rust_source[begin..(begin + m)].rfind(['\\', '\n', '/', '\"'])
                    && rust_source.as_bytes()[begin + x] != b'\n'
                {
                    // Looks like this "slint" occurs inside a comment or string:
                    // skip to the end of the current line and search again.
                    begin += m + 5;
                    begin += rust_source[begin..].find(['\n']).unwrap_or(0);
                    continue;
                }
                // Skip over "slint" (5 bytes), then optional spaces, the '!',
                // and more optional spaces before the delimiter.
                begin += m + 5;
                while rust_source[begin..].starts_with(' ') {
                    begin += 1;
                }
                if !rust_source[begin..].starts_with('!') {
                    continue;
                }
                begin += 1;
                while rust_source[begin..].starts_with(' ') {
                    begin += 1;
                }
                let Some(open) = rust_source.as_bytes().get(begin) else { continue };
                match open {
                    b'{' => break (SyntaxKind::LBrace, SyntaxKind::RBrace),
                    b'[' => break (SyntaxKind::LBracket, SyntaxKind::RBracket),
                    b'(' => break (SyntaxKind::LParent, SyntaxKind::RParent),
                    _ => continue,
                }
            } else {
                // No macro found, just return
                return None;
            }
        };

        begin += 1;

        // Now find the matching closing delimiter
        // Technically, we should be lexing rust, not slint
        let mut state = LexState::default();
        let start = begin;
        let mut end = begin;
        // `level` counts nested delimiters of the same kind inside the macro body.
        let mut level = 0;
        while !rust_source[end..].is_empty() {
            let len = match crate::parser::lex_next_token(&rust_source[end..], &mut state) {
                Some((len, x)) if x == open => {
                    level += 1;
                    len
                }
                Some((_, x)) if x == close && level == 0 => {
                    break;
                }
                Some((len, x)) if x == close => {
                    level -= 1;
                    len
                }
                Some((len, _)) => len,
                None => {
                    // Lex error
                    break;
                }
            };
            if len == 0 {
                break; // Shouldn't happen
            }
            end += len;
        }
        // Resume the next search after this macro body.
        begin = end;
        Some(start..end)
    })
}
408
#[test]
fn test_locate_rust_macro() {
    // Locate every macro in `source` and check that the captured bodies match.
    #[track_caller]
    fn do_test(source: &str, captures: &[&str]) {
        let found: Vec<&str> = locate_slint_macro(source).map(|range| &source[range]).collect();
        assert_eq!(found.as_slice(), captures);
    }

    // `slint` not immediately followed by `!` must not match.
    do_test("\nslint{!{}}", &[]);
    // Macros inside comments are skipped; all three delimiter kinds are found.
    do_test(
        "//slint!(123)\nslint!(456)\nslint ![789]\n/*slint!{abc}*/\nslint! {def}",
        &["456", "789", "def"],
    );
    // A nested `slint!` is captured verbatim inside the outer macro body.
    do_test("slint!(slint!(abc))slint!()", &["slint!(abc)", ""]);
}
424
425/// Given a Rust source file contents, return a string containing the contents of the first `slint!` macro
426///
427/// All the other bytes which are not newlines are replaced by space. This allow offsets in the resulting
428/// string to preserve line and column number.
429///
430/// The last byte before the Slint area will be \u{2} (ASCII Start-of-Text), the first byte after
431/// the slint code will be \u{3} (ASCII End-of-Text), so that programs can find the area of slint code
432/// within the program.
433///
434/// Note that the slint compiler considers Start-of-Text and End-of-Text as whitespace and will treat them
435/// accordingly.
436pub fn extract_rust_macro(rust_source: String) -> Option<String> {
437    let core::ops::Range { start, end } = locate_slint_macro(&rust_source).next()?;
438    let mut bytes = rust_source.into_bytes();
439    for c in &mut bytes[..start] {
440        if *c != b'\n' {
441            *c = b' '
442        }
443    }
444
445    if start > 0 {
446        bytes[start - 1] = 2;
447    }
448    if end < bytes.len() {
449        bytes[end] = 3;
450
451        for c in &mut bytes[end + 1..] {
452            if *c != b'\n' {
453                *c = b' '
454            }
455        }
456    }
457    Some(String::from_utf8(bytes).expect("We just added spaces"))
458}
459
#[test]
fn test_extract_rust_macro() {
    // No macro at all -> None.
    assert_eq!(extract_rust_macro("\nslint{!{}}".into()), None);
    // Multi-byte characters before the macro are blanked to spaces, preserving
    // line structure; the body (including strings with braces) is kept verbatim.
    assert_eq!(
        extract_rust_macro(
            "abc\n€\nslint !  {x \" \\\" }🦀\" { () {}\n {} }xx =}-  ;}\n xxx \n yyy {}\n".into(),
        ),
        Some(
            "   \n   \n         \u{2}x \" \\\" }🦀\" { () {}\n {} }xx =\u{3}     \n     \n       \n".into(),
        )
    );

    // A path-qualified `slint!` is found as well.
    assert_eq!(
        extract_rust_macro("xx\nabcd::slint!{abc{}efg".into()),
        Some("  \n            \u{2}abc{}efg".into())
    );
    // An unterminated macro body extends to the end of the input (no \u{3}).
    assert_eq!(
        extract_rust_macro("slint!\nnot.\nslint!{\nunterminated\nxxx".into()),
        Some("      \n    \n      \u{2}\nunterminated\nxxx".into())
    );
    // Macros inside comments or strings are skipped by the heuristic.
    assert_eq!(extract_rust_macro("foo\n/* slint! { hello }\n".into()), None);
    assert_eq!(extract_rust_macro("foo\n/* slint::slint! { hello }\n".into()), None);
    assert_eq!(
        extract_rust_macro("foo\n// slint! { hello }\nslint!{world}\na".into()),
        Some("   \n                   \n      \u{2}world\u{3}\n ".into())
    );
    assert_eq!(extract_rust_macro("foo\n\" slint! { hello }\"\n".into()), None);
    // Parenthesized macro with a block comment inside the body.
    assert_eq!(
        extract_rust_macro(
            "abc\n€\nslint !  (x /* \\\" )🦀*/ { () {}\n {} }xx =)-  ;}\n xxx \n yyy {}\n".into(),
        ),
        Some(
            "   \n   \n         \u{2}x /* \\\" )🦀*/ { () {}\n {} }xx =\u{3}     \n     \n       \n".into(),
        )
    );
    // Bracket-delimited macro containing a nested `slint!` invocation.
    assert_eq!(
        extract_rust_macro("abc slint![x slint!() [{[]}] s] abc".into()),
        Some("          \u{0002}x slint!() [{[]}] s\u{0003}    ".into()),
    );
}
495    assert_eq!(
496        extract_rust_macro("abc slint![x slint!() [{[]}] s] abc".into()),
497        Some("          \u{0002}x slint!() [{[]}] s\u{0003}    ".into()),
498    );
499}