typst_syntax/
lexer.rs

1use ecow::{eco_format, EcoString};
2use unicode_ident::{is_xid_continue, is_xid_start};
3use unicode_script::{Script, UnicodeScript};
4use unicode_segmentation::UnicodeSegmentation;
5use unscanny::Scanner;
6
7use crate::{SyntaxError, SyntaxKind, SyntaxNode};
8
/// An iterator over a source code string which returns tokens.
///
/// Which kinds of tokens are produced depends on the current [`LexMode`].
#[derive(Clone)]
pub(super) struct Lexer<'s> {
    /// The scanner: contains the underlying string and location as a "cursor".
    s: Scanner<'s>,
    /// The mode the lexer is in. This determines which kinds of tokens it
    /// produces.
    mode: LexMode,
    /// Whether the last token contained a newline.
    newline: bool,
    /// An error for the last token. It is recorded while a token is lexed and
    /// taken out again when `next` builds the node for that token.
    error: Option<SyntaxError>,
}
22
/// What kind of tokens to emit.
///
/// The lexer dispatches on this mode for every token that is not whitespace,
/// a comment, a shebang or raw text.
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
pub(super) enum LexMode {
    /// Text and markup.
    Markup,
    /// Math atoms, operators, etc.
    Math,
    /// Keywords, literals and operators.
    Code,
}
33
34impl<'s> Lexer<'s> {
35    /// Create a new lexer with the given mode and a prefix to offset column
36    /// calculations.
37    pub fn new(text: &'s str, mode: LexMode) -> Self {
38        Self {
39            s: Scanner::new(text),
40            mode,
41            newline: false,
42            error: None,
43        }
44    }
45
46    /// Get the current lexing mode.
47    pub fn mode(&self) -> LexMode {
48        self.mode
49    }
50
51    /// Change the lexing mode.
52    pub fn set_mode(&mut self, mode: LexMode) {
53        self.mode = mode;
54    }
55
56    /// The index in the string at which the last token ends and next token
57    /// will start.
58    pub fn cursor(&self) -> usize {
59        self.s.cursor()
60    }
61
62    /// Jump to the given index in the string.
63    pub fn jump(&mut self, index: usize) {
64        self.s.jump(index);
65    }
66
67    /// Whether the last token contained a newline.
68    pub fn newline(&self) -> bool {
69        self.newline
70    }
71
72    /// The number of characters until the most recent newline from an index.
73    pub fn column(&self, index: usize) -> usize {
74        let mut s = self.s; // Make a new temporary scanner (cheap).
75        s.jump(index);
76        s.before().chars().rev().take_while(|&c| !is_newline(c)).count()
77    }
78}
79
80impl Lexer<'_> {
81    /// Construct a full-positioned syntax error.
82    fn error(&mut self, message: impl Into<EcoString>) -> SyntaxKind {
83        self.error = Some(SyntaxError::new(message));
84        SyntaxKind::Error
85    }
86
87    /// If the current node is an error, adds a hint.
88    fn hint(&mut self, message: impl Into<EcoString>) {
89        if let Some(error) = &mut self.error {
90            error.hints.push(message.into());
91        }
92    }
93}
94
/// Methods shared by all lexing modes (see [`LexMode`]).
96impl Lexer<'_> {
    /// Return the next token in our text. Returns both the [`SyntaxNode`]
    /// and the raw [`SyntaxKind`] to make it more ergonomic to check the kind.
    ///
    /// Resets the `newline` flag before lexing. If an error was recorded
    /// while lexing the token, the returned node is an error node carrying
    /// the token's text.
    pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) {
        // The error of the previous token must already have been taken.
        debug_assert!(self.error.is_none());
        let start = self.s.cursor();

        self.newline = false;
        // Note: the guards below call `eat_if`, which consumes input when it
        // matches, so the order of the arms matters.
        let kind = match self.s.eat() {
            Some(c) if is_space(c, self.mode) => self.whitespace(start, c),
            // A shebang is only recognized at the very start of the text.
            Some('#') if start == 0 && self.s.eat_if('!') => self.shebang(),
            Some('/') if self.s.eat_if('/') => self.line_comment(),
            Some('/') if self.s.eat_if('*') => self.block_comment(),
            // A `*/` without a matching opening `/*`.
            Some('*') if self.s.eat_if('/') => {
                let kind = self.error("unexpected end of block comment");
                self.hint(
                    "consider escaping the `*` with a backslash or \
                     opening the block comment with `/*`",
                );
                kind
            }
            // Raw text builds its own node, so return it directly.
            Some('`') if self.mode != LexMode::Math => return self.raw(),
            // Everything else is mode-specific.
            Some(c) => match self.mode {
                LexMode::Markup => self.markup(start, c),
                // Math may hand back a fully built node; if so, return it
                // as is.
                LexMode::Math => match self.math(start, c) {
                    (kind, None) => kind,
                    (kind, Some(node)) => return (kind, node),
                },
                LexMode::Code => self.code(start, c),
            },

            None => SyntaxKind::End,
        };

        // Build the node from everything eaten since `start`.
        let text = self.s.from(start);
        let node = match self.error.take() {
            Some(error) => SyntaxNode::error(error, text),
            None => SyntaxNode::leaf(kind, text),
        };
        (kind, node)
    }
137
138    /// Eat whitespace characters greedily.
139    fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
140        let more = self.s.eat_while(|c| is_space(c, self.mode));
141        let newlines = match c {
142            // Optimize eating a single space.
143            ' ' if more.is_empty() => 0,
144            _ => count_newlines(self.s.from(start)),
145        };
146
147        self.newline = newlines > 0;
148        if self.mode == LexMode::Markup && newlines >= 2 {
149            SyntaxKind::Parbreak
150        } else {
151            SyntaxKind::Space
152        }
153    }
154
155    fn shebang(&mut self) -> SyntaxKind {
156        self.s.eat_until(is_newline);
157        SyntaxKind::Shebang
158    }
159
160    fn line_comment(&mut self) -> SyntaxKind {
161        self.s.eat_until(is_newline);
162        SyntaxKind::LineComment
163    }
164
165    fn block_comment(&mut self) -> SyntaxKind {
166        let mut state = '_';
167        let mut depth = 1;
168
169        // Find the first `*/` that does not correspond to a nested `/*`.
170        while let Some(c) = self.s.eat() {
171            state = match (state, c) {
172                ('*', '/') => {
173                    depth -= 1;
174                    if depth == 0 {
175                        break;
176                    }
177                    '_'
178                }
179                ('/', '*') => {
180                    depth += 1;
181                    '_'
182                }
183                _ => c,
184            }
185        }
186
187        SyntaxKind::BlockComment
188    }
189}
190
191/// Markup.
192impl Lexer<'_> {
    /// Lex a markup token. The first character `c` of the token, which
    /// begins at index `start`, has already been eaten by the caller.
    ///
    /// Several guards call `eat_if`, which consumes input when it matches,
    /// so the order of the arms matters.
    fn markup(&mut self, start: usize, c: char) -> SyntaxKind {
        match c {
            '\\' => self.backslash(),
            // Automatic links.
            'h' if self.s.eat_if("ttp://") => self.link(),
            'h' if self.s.eat_if("ttps://") => self.link(),
            '<' if self.s.at(is_id_continue) => self.label(),
            '@' => self.ref_marker(),

            // Shorthands; longer sequences are tried before shorter ones.
            '.' if self.s.eat_if("..") => SyntaxKind::Shorthand,
            '-' if self.s.eat_if("--") => SyntaxKind::Shorthand,
            '-' if self.s.eat_if('-') => SyntaxKind::Shorthand,
            '-' if self.s.eat_if('?') => SyntaxKind::Shorthand,
            '-' if self.s.at(char::is_numeric) => SyntaxKind::Shorthand,
            // `*` and `_` are only markers when not in the middle of a word.
            '*' if !self.in_word() => SyntaxKind::Star,
            '_' if !self.in_word() => SyntaxKind::Underscore,

            '#' => SyntaxKind::Hash,
            '[' => SyntaxKind::LeftBracket,
            ']' => SyntaxKind::RightBracket,
            '\'' => SyntaxKind::SmartQuote,
            '"' => SyntaxKind::SmartQuote,
            '$' => SyntaxKind::Dollar,
            '~' => SyntaxKind::Shorthand,
            ':' => SyntaxKind::Colon,
            // A run of `=` is a heading marker only if followed by a space,
            // a comment or the end of the text; otherwise it is plain text.
            '=' => {
                self.s.eat_while('=');
                if self.space_or_end() {
                    SyntaxKind::HeadingMarker
                } else {
                    self.text()
                }
            }
            // List-like markers must also be followed by a space, a comment
            // or the end of the text.
            '-' if self.space_or_end() => SyntaxKind::ListMarker,
            '+' if self.space_or_end() => SyntaxKind::EnumMarker,
            '/' if self.space_or_end() => SyntaxKind::TermMarker,
            '0'..='9' => self.numbering(start),

            _ => self.text(),
        }
    }
233
234    fn backslash(&mut self) -> SyntaxKind {
235        if self.s.eat_if("u{") {
236            let hex = self.s.eat_while(char::is_ascii_alphanumeric);
237            if !self.s.eat_if('}') {
238                return self.error("unclosed Unicode escape sequence");
239            }
240
241            if u32::from_str_radix(hex, 16)
242                .ok()
243                .and_then(std::char::from_u32)
244                .is_none()
245            {
246                return self.error(eco_format!("invalid Unicode codepoint: {}", hex));
247            }
248
249            return SyntaxKind::Escape;
250        }
251
252        if self.s.done() || self.s.at(char::is_whitespace) {
253            SyntaxKind::Linebreak
254        } else {
255            self.s.eat();
256            SyntaxKind::Escape
257        }
258    }
259
    /// We parse entire raw segments in the lexer as a convenience to avoid
    /// going to and from the parser for each raw section. See comments in
    /// [`Self::blocky_raw`] and [`Self::inline_raw`] for specific details.
    ///
    /// Expects the first opening backtick to have been eaten already. Returns
    /// a `Raw` inner node made of delimiter, text and trimmed leaf nodes, or
    /// an error node if the raw text is never closed.
    fn raw(&mut self) -> (SyntaxKind, SyntaxNode) {
        // Step back over the backtick that the caller already ate.
        let start = self.s.cursor() - 1;

        // Determine number of opening backticks.
        let mut backticks = 1;
        while self.s.eat_if('`') {
            backticks += 1;
        }

        // Special case for ``.
        if backticks == 2 {
            let nodes = vec![
                SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
                SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
            ];
            return (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes));
        }

        // Find end of raw text: a run of as many consecutive backticks as
        // were used to open it.
        let mut found = 0;
        while found < backticks {
            match self.s.eat() {
                Some('`') => found += 1,
                Some(_) => found = 0,
                None => {
                    let msg = SyntaxError::new("unclosed raw text");
                    let error = SyntaxNode::error(msg, self.s.from(start));
                    return (SyntaxKind::Error, error);
                }
            }
        }
        let end = self.s.cursor();

        let mut nodes = Vec::with_capacity(3); // Will have at least 3.

        // A closure for pushing a node onto our raw vector. Assumes the caller
        // will move the scanner to the next location at each step.
        let mut prev_start = start;
        let mut push_raw = |kind, s: &Scanner| {
            nodes.push(SyntaxNode::leaf(kind, s.from(prev_start)));
            prev_start = s.cursor();
        };

        // Opening delimiter.
        self.s.jump(start + backticks);
        push_raw(SyntaxKind::RawDelim, &self.s);

        // Three or more backticks form a raw block, a single one inline raw.
        // `end - backticks` is the index where the closing delimiter begins.
        if backticks >= 3 {
            self.blocky_raw(end - backticks, &mut push_raw);
        } else {
            self.inline_raw(end - backticks, &mut push_raw);
        }

        // Closing delimiter.
        self.s.jump(end);
        push_raw(SyntaxKind::RawDelim, &self.s);

        (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes))
    }
322
    /// Raw blocks parse a language tag, have smart behavior for trimming
    /// whitespace in the start/end lines, and trim common leading whitespace
    /// from all other lines as the "dedent". The exact behavior is described
    /// below.
    ///
    /// `inner_end` is the index at which the raw content ends. `push_raw` is
    /// called with a kind and the scanner each time the scanner has been
    /// moved past the text of one element.
    ///
    /// ### The initial line:
    /// - A valid Typst identifier immediately following the opening delimiter
    ///   is parsed as the language tag.
    /// - We check the rest of the line and if all characters are whitespace,
    ///   trim it. Otherwise we trim a single leading space if present.
    ///   - If more trimmed characters follow on future lines, they will be
    ///     merged into the same trimmed element.
    /// - If we didn't trim the entire line, the rest is kept as text.
    ///
    /// ### Inner lines:
    /// - We determine the "dedent" by iterating over the lines. The dedent is
    ///   the minimum number of leading whitespace characters (not bytes) before
    ///   each line that has any non-whitespace characters.
    ///   - The opening delimiter's line does not contribute to the dedent, but
    ///     the closing delimiter's line does (even if that line is entirely
    ///     whitespace up to the delimiter).
    /// - We then trim the newline and dedent characters of each line, and add a
    ///   (potentially empty) text element of all remaining characters.
    ///
    /// ### The final line:
    /// - If the last line is entirely whitespace, it is trimmed.
    /// - Otherwise its text is kept like an inner line. However, if the last
    ///   non-whitespace character of the final line is a backtick, then one
    ///   ascii space (if present) is trimmed from the end.
    fn blocky_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
    where
        F: FnMut(SyntaxKind, &Scanner),
    {
        // Language tag.
        if self.s.eat_if(is_id_start) {
            self.s.eat_while(is_id_continue);
            push_raw(SyntaxKind::RawLang, &self.s);
        }

        // The rest of the function operates on the lines between the backticks.
        let mut lines = split_newlines(self.s.to(inner_end));

        // Determine dedent level.
        let dedent = lines
            .iter()
            .skip(1)
            .filter(|line| !line.chars().all(char::is_whitespace))
            // The line with the closing ``` is always taken into account
            .chain(lines.last())
            .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
            .min()
            .unwrap_or(0);

        // Trim whitespace from the last line. Will be added as a `RawTrimmed`
        // kind by the final `self.s.cursor() < inner_end` check below.
        if lines.last().is_some_and(|last| last.chars().all(char::is_whitespace)) {
            lines.pop();
        } else if let Some(last) = lines.last_mut() {
            // If last line ends in a backtick, try to trim a single space. This
            // check must happen before we add the first line since the last and
            // first lines might be the same.
            if last.trim_end().ends_with('`') {
                *last = last.strip_suffix(' ').unwrap_or(last);
            }
        }

        let mut lines = lines.into_iter();

        // Handle the first line: trim if all whitespace, or trim a single space
        // at the start. Note that the first line does not affect the dedent
        // value.
        if let Some(first_line) = lines.next() {
            if first_line.chars().all(char::is_whitespace) {
                self.s.advance(first_line.len());
                // This is the only spot we advance the scanner, but don't
                // immediately call `push_raw`. But the rest of the function
                // ensures we will always add this text to a `RawTrimmed` later.
                debug_assert!(self.s.cursor() != inner_end);
                // A proof by cases follows:
                // # First case: The loop runs
                // If the loop runs, there must be a newline following, so
                // `cursor != inner_end`. And if the loop runs, the first thing
                // it does is add a trimmed element.
                // # Second case: The final if-statement runs.
                // To _not_ reach the loop from here, we must have only one or
                // two lines:
                // 1. If one line, we cannot be here, because the first and last
                //    lines are the same, so this line will have been removed by
                //    the check for the last line being all whitespace.
                // 2. If two lines, the loop will run unless the last is fully
                //    whitespace, but if it is, it will have been popped, then
                //    the final if-statement will run because the text removed
                //    by the last line must include at least a newline, so
                //    `cursor != inner_end` here.
            } else {
                let line_end = self.s.cursor() + first_line.len();
                if self.s.eat_if(' ') {
                    // Trim a single space after the lang tag on the first line.
                    push_raw(SyntaxKind::RawTrimmed, &self.s);
                }
                // We know here that the rest of the line is non-empty.
                self.s.jump(line_end);
                push_raw(SyntaxKind::Text, &self.s);
            }
        }

        // Add lines.
        for line in lines {
            // The dedent is counted in characters, so convert it to a byte
            // offset for this particular line.
            let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
            self.s.eat_newline();
            self.s.advance(offset);
            push_raw(SyntaxKind::RawTrimmed, &self.s);
            self.s.advance(line.len() - offset);
            push_raw(SyntaxKind::Text, &self.s);
        }

        // Add final trimmed.
        if self.s.cursor() < inner_end {
            self.s.jump(inner_end);
            push_raw(SyntaxKind::RawTrimmed, &self.s);
        }
    }
445
446    /// Inline raw text is split on lines with non-newlines as `Text` kinds and
447    /// newlines as `RawTrimmed`. Inline raw text does not dedent the text, all
448    /// non-newline whitespace is kept.
449    fn inline_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
450    where
451        F: FnMut(SyntaxKind, &Scanner),
452    {
453        while self.s.cursor() < inner_end {
454            if self.s.at(is_newline) {
455                push_raw(SyntaxKind::Text, &self.s);
456                self.s.eat_newline();
457                push_raw(SyntaxKind::RawTrimmed, &self.s);
458                continue;
459            }
460            self.s.eat();
461        }
462        push_raw(SyntaxKind::Text, &self.s);
463    }
464
465    fn link(&mut self) -> SyntaxKind {
466        let (link, balanced) = link_prefix(self.s.after());
467        self.s.advance(link.len());
468
469        if !balanced {
470            return self.error(
471                "automatic links cannot contain unbalanced brackets, \
472                 use the `link` function instead",
473            );
474        }
475
476        SyntaxKind::Link
477    }
478
479    fn numbering(&mut self, start: usize) -> SyntaxKind {
480        self.s.eat_while(char::is_ascii_digit);
481
482        let read = self.s.from(start);
483        if self.s.eat_if('.') && self.space_or_end() && read.parse::<usize>().is_ok() {
484            return SyntaxKind::EnumMarker;
485        }
486
487        self.text()
488    }
489
490    fn ref_marker(&mut self) -> SyntaxKind {
491        self.s.eat_while(is_valid_in_label_literal);
492
493        // Don't include the trailing characters likely to be part of text.
494        while matches!(self.s.scout(-1), Some('.' | ':')) {
495            self.s.uneat();
496        }
497
498        SyntaxKind::RefMarker
499    }
500
501    fn label(&mut self) -> SyntaxKind {
502        let label = self.s.eat_while(is_valid_in_label_literal);
503        if label.is_empty() {
504            return self.error("label cannot be empty");
505        }
506
507        if !self.s.eat_if('>') {
508            return self.error("unclosed label");
509        }
510
511        SyntaxKind::Label
512    }
513
514    fn text(&mut self) -> SyntaxKind {
515        macro_rules! table {
516            ($(|$c:literal)*) => {
517                static TABLE: [bool; 128] = {
518                    let mut t = [false; 128];
519                    $(t[$c as usize] = true;)*
520                    t
521                };
522            };
523        }
524
525        table! {
526            | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
527            | '[' | ']' | '~' | '-' | '.' | '\'' | '"' | '*' | '_'
528            | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
529        };
530
531        loop {
532            self.s.eat_until(|c: char| {
533                TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
534            });
535
536            // Continue with the same text node if the thing would become text
537            // anyway.
538            let mut s = self.s;
539            match s.eat() {
540                Some(' ') if s.at(char::is_alphanumeric) => {}
541                Some('/') if !s.at(['/', '*']) => {}
542                Some('-') if !s.at(['-', '?']) => {}
543                Some('.') if !s.at("..") => {}
544                Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
545                Some('@') if !s.at(is_valid_in_label_literal) => {}
546                _ => break,
547            }
548
549            self.s = s;
550        }
551
552        SyntaxKind::Text
553    }
554
555    fn in_word(&self) -> bool {
556        let wordy = |c: Option<char>| {
557            c.is_some_and(|c| {
558                c.is_alphanumeric()
559                    && !matches!(
560                        c.script(),
561                        Script::Han
562                            | Script::Hiragana
563                            | Script::Katakana
564                            | Script::Hangul
565                    )
566            })
567        };
568        let prev = self.s.scout(-2);
569        let next = self.s.peek();
570        wordy(prev) && wordy(next)
571    }
572
573    fn space_or_end(&self) -> bool {
574        self.s.done()
575            || self.s.at(char::is_whitespace)
576            || self.s.at("//")
577            || self.s.at("/*")
578    }
579}
580
581/// Math.
582impl Lexer<'_> {
    /// Lex a math token. The first character `c` of the token, which begins
    /// at index `start`, has already been eaten by the caller.
    ///
    /// Returns the token's kind and, for identifiers and field accesses, a
    /// fully built node. When the node is `None`, the caller builds a leaf
    /// from the eaten text.
    ///
    /// The guards call `eat_if`, which consumes input when it matches, so
    /// the order of the arms matters.
    fn math(&mut self, start: usize, c: char) -> (SyntaxKind, Option<SyntaxNode>) {
        let kind = match c {
            '\\' => self.backslash(),
            '"' => self.string(),

            // Multi-character shorthands.
            '-' if self.s.eat_if(">>") => SyntaxKind::MathShorthand,
            '-' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            '-' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            ':' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            ':' if self.s.eat_if(":=") => SyntaxKind::MathShorthand,
            '!' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            '.' if self.s.eat_if("..") => SyntaxKind::MathShorthand,
            '[' if self.s.eat_if('|') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("==>") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("-->") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("--") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("-<") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("<-") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("<<") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("==") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("~~") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('<') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('-') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('~') => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if(">>") => SyntaxKind::MathShorthand,
            '=' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
            '=' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            '=' if self.s.eat_if(':') => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if(']') => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if('|') => SyntaxKind::MathShorthand,
            '~' if self.s.eat_if("~>") => SyntaxKind::MathShorthand,
            '~' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            // Single-character shorthands.
            '*' | '-' | '~' => SyntaxKind::MathShorthand,

            '.' => SyntaxKind::Dot,
            ',' => SyntaxKind::Comma,
            ';' => SyntaxKind::Semicolon,
            ')' => SyntaxKind::RightParen,

            '#' => SyntaxKind::Hash,
            '_' => SyntaxKind::Underscore,
            '$' => SyntaxKind::Dollar,
            '/' => SyntaxKind::Slash,
            '^' => SyntaxKind::Hat,
            '\'' => SyntaxKind::Prime,
            '&' => SyntaxKind::MathAlignPoint,
            '√' | '∛' | '∜' => SyntaxKind::Root,

            // Identifiers. These need at least two characters: `c` itself
            // plus one more identifier character.
            c if is_math_id_start(c) && self.s.at(is_math_id_continue) => {
                self.s.eat_while(is_math_id_continue);
                let (kind, node) = self.math_ident_or_field(start);
                return (kind, Some(node));
            }

            // Other math atoms.
            _ => self.math_text(start, c),
        };
        (kind, None)
    }
651
652    /// Parse a single `MathIdent` or an entire `FieldAccess`.
653    fn math_ident_or_field(&mut self, start: usize) -> (SyntaxKind, SyntaxNode) {
654        let mut kind = SyntaxKind::MathIdent;
655        let mut node = SyntaxNode::leaf(kind, self.s.from(start));
656        while let Some(ident) = self.maybe_dot_ident() {
657            kind = SyntaxKind::FieldAccess;
658            let field_children = vec![
659                node,
660                SyntaxNode::leaf(SyntaxKind::Dot, '.'),
661                SyntaxNode::leaf(SyntaxKind::Ident, ident),
662            ];
663            node = SyntaxNode::inner(kind, field_children);
664        }
665        (kind, node)
666    }
667
668    /// If at a dot and a math identifier, eat and return the identifier.
669    fn maybe_dot_ident(&mut self) -> Option<&str> {
670        if self.s.scout(1).is_some_and(is_math_id_start) && self.s.eat_if('.') {
671            let ident_start = self.s.cursor();
672            self.s.eat();
673            self.s.eat_while(is_math_id_continue);
674            Some(self.s.from(ident_start))
675        } else {
676            None
677        }
678    }
679
680    fn math_text(&mut self, start: usize, c: char) -> SyntaxKind {
681        // Keep numbers and grapheme clusters together.
682        if c.is_numeric() {
683            self.s.eat_while(char::is_numeric);
684            let mut s = self.s;
685            if s.eat_if('.') && !s.eat_while(char::is_numeric).is_empty() {
686                self.s = s;
687            }
688            SyntaxKind::MathText
689        } else {
690            let len = self
691                .s
692                .get(start..self.s.string().len())
693                .graphemes(true)
694                .next()
695                .map_or(0, str::len);
696            self.s.jump(start + len);
697            if len > c.len_utf8() {
698                // Grapheme clusters are treated as normal text and stay grouped
699                // This may need to change in the future.
700                SyntaxKind::Text
701            } else {
702                SyntaxKind::MathText
703            }
704        }
705    }
706
707    /// Handle named arguments in math function call.
708    pub fn maybe_math_named_arg(&mut self, start: usize) -> Option<SyntaxNode> {
709        let cursor = self.s.cursor();
710        self.s.jump(start);
711        if self.s.eat_if(is_id_start) {
712            self.s.eat_while(is_id_continue);
713            // Check that a colon directly follows the identifier, and not the
714            // `:=` or `::=` math shorthands.
715            if self.s.at(':') && !self.s.at(":=") && !self.s.at("::=") {
716                // Check that the identifier is not just `_`.
717                let node = if self.s.from(start) != "_" {
718                    SyntaxNode::leaf(SyntaxKind::Ident, self.s.from(start))
719                } else {
720                    let msg = SyntaxError::new("expected identifier, found underscore");
721                    SyntaxNode::error(msg, self.s.from(start))
722                };
723                return Some(node);
724            }
725        }
726        self.s.jump(cursor);
727        None
728    }
729
730    /// Handle spread arguments in math function call.
731    pub fn maybe_math_spread_arg(&mut self, start: usize) -> Option<SyntaxNode> {
732        let cursor = self.s.cursor();
733        self.s.jump(start);
734        if self.s.eat_if("..") {
735            // Check that neither a space nor a dot follows the spread syntax.
736            // A dot would clash with the `...` math shorthand.
737            if !self.space_or_end() && !self.s.at('.') {
738                let node = SyntaxNode::leaf(SyntaxKind::Dots, self.s.from(start));
739                return Some(node);
740            }
741        }
742        self.s.jump(cursor);
743        None
744    }
745}
746
747/// Code.
748impl Lexer<'_> {
    /// Lex a code token. The first character `c` of the token, which begins
    /// at index `start`, has already been eaten by the caller.
    ///
    /// The guards call `eat_if`, which consumes input when it matches, so
    /// two-character operators must be tried before their one-character
    /// prefixes.
    fn code(&mut self, start: usize, c: char) -> SyntaxKind {
        match c {
            '<' if self.s.at(is_id_continue) => self.label(),
            '0'..='9' => self.number(start, c),
            // A number may also start with its decimal point.
            '.' if self.s.at(char::is_ascii_digit) => self.number(start, c),
            '"' => self.string(),

            // Two-character operators.
            '=' if self.s.eat_if('=') => SyntaxKind::EqEq,
            '!' if self.s.eat_if('=') => SyntaxKind::ExclEq,
            '<' if self.s.eat_if('=') => SyntaxKind::LtEq,
            '>' if self.s.eat_if('=') => SyntaxKind::GtEq,
            '+' if self.s.eat_if('=') => SyntaxKind::PlusEq,
            '-' | '\u{2212}' if self.s.eat_if('=') => SyntaxKind::HyphEq,
            '*' if self.s.eat_if('=') => SyntaxKind::StarEq,
            '/' if self.s.eat_if('=') => SyntaxKind::SlashEq,
            '.' if self.s.eat_if('.') => SyntaxKind::Dots,
            '=' if self.s.eat_if('>') => SyntaxKind::Arrow,

            // Single-character tokens.
            '{' => SyntaxKind::LeftBrace,
            '}' => SyntaxKind::RightBrace,
            '[' => SyntaxKind::LeftBracket,
            ']' => SyntaxKind::RightBracket,
            '(' => SyntaxKind::LeftParen,
            ')' => SyntaxKind::RightParen,
            '$' => SyntaxKind::Dollar,
            ',' => SyntaxKind::Comma,
            ';' => SyntaxKind::Semicolon,
            ':' => SyntaxKind::Colon,
            '.' => SyntaxKind::Dot,
            '+' => SyntaxKind::Plus,
            '-' | '\u{2212}' => SyntaxKind::Minus,
            '*' => SyntaxKind::Star,
            '/' => SyntaxKind::Slash,
            '=' => SyntaxKind::Eq,
            '<' => SyntaxKind::Lt,
            '>' => SyntaxKind::Gt,

            c if is_id_start(c) => self.ident(start),

            // Anything else cannot appear in code.
            c => self.error(eco_format!("the character `{c}` is not valid in code")),
        }
    }
791
792    fn ident(&mut self, start: usize) -> SyntaxKind {
793        self.s.eat_while(is_id_continue);
794        let ident = self.s.from(start);
795
796        let prev = self.s.get(0..start);
797        if !prev.ends_with(['.', '@']) || prev.ends_with("..") {
798            if let Some(keyword) = keyword(ident) {
799                return keyword;
800            }
801        }
802
803        if ident == "_" {
804            SyntaxKind::Underscore
805        } else {
806            SyntaxKind::Ident
807        }
808    }
809
810    fn number(&mut self, mut start: usize, c: char) -> SyntaxKind {
811        // Handle alternative integer bases.
812        let mut base = 10;
813        if c == '0' {
814            if self.s.eat_if('b') {
815                base = 2;
816            } else if self.s.eat_if('o') {
817                base = 8;
818            } else if self.s.eat_if('x') {
819                base = 16;
820            }
821            if base != 10 {
822                start = self.s.cursor();
823            }
824        }
825
826        // Read the first part (integer or fractional depending on `first`).
827        self.s.eat_while(if base == 16 {
828            char::is_ascii_alphanumeric
829        } else {
830            char::is_ascii_digit
831        });
832
833        // Read the fractional part if not already done.
834        // Make sure not to confuse a range for the decimal separator.
835        if c != '.'
836            && !self.s.at("..")
837            && !self.s.scout(1).is_some_and(is_id_start)
838            && self.s.eat_if('.')
839            && base == 10
840        {
841            self.s.eat_while(char::is_ascii_digit);
842        }
843
844        // Read the exponent.
845        if !self.s.at("em") && self.s.eat_if(['e', 'E']) && base == 10 {
846            self.s.eat_if(['+', '-']);
847            self.s.eat_while(char::is_ascii_digit);
848        }
849
850        // Read the suffix.
851        let suffix_start = self.s.cursor();
852        if !self.s.eat_if('%') {
853            self.s.eat_while(char::is_ascii_alphanumeric);
854        }
855
856        let number = self.s.get(start..suffix_start);
857        let suffix = self.s.from(suffix_start);
858
859        let kind = if i64::from_str_radix(number, base).is_ok() {
860            SyntaxKind::Int
861        } else if base == 10 && number.parse::<f64>().is_ok() {
862            SyntaxKind::Float
863        } else {
864            return self.error(match base {
865                2 => eco_format!("invalid binary number: 0b{}", number),
866                8 => eco_format!("invalid octal number: 0o{}", number),
867                16 => eco_format!("invalid hexadecimal number: 0x{}", number),
868                _ => eco_format!("invalid number: {}", number),
869            });
870        };
871
872        if suffix.is_empty() {
873            return kind;
874        }
875
876        if !matches!(
877            suffix,
878            "pt" | "mm" | "cm" | "in" | "deg" | "rad" | "em" | "fr" | "%"
879        ) {
880            return self.error(eco_format!("invalid number suffix: {}", suffix));
881        }
882
883        if base != 10 {
884            let kind = self.error(eco_format!("invalid base-{base} prefix"));
885            self.hint("numbers with a unit cannot have a base prefix");
886            return kind;
887        }
888
889        SyntaxKind::Numeric
890    }
891
892    fn string(&mut self) -> SyntaxKind {
893        let mut escaped = false;
894        self.s.eat_until(|c| {
895            let stop = c == '"' && !escaped;
896            escaped = c == '\\' && !escaped;
897            stop
898        });
899
900        if !self.s.eat_if('"') {
901            return self.error("unclosed string");
902        }
903
904        SyntaxKind::Str
905    }
906}
907
908/// Try to parse an identifier into a keyword.
909fn keyword(ident: &str) -> Option<SyntaxKind> {
910    Some(match ident {
911        "none" => SyntaxKind::None,
912        "auto" => SyntaxKind::Auto,
913        "true" => SyntaxKind::Bool,
914        "false" => SyntaxKind::Bool,
915        "not" => SyntaxKind::Not,
916        "and" => SyntaxKind::And,
917        "or" => SyntaxKind::Or,
918        "let" => SyntaxKind::Let,
919        "set" => SyntaxKind::Set,
920        "show" => SyntaxKind::Show,
921        "context" => SyntaxKind::Context,
922        "if" => SyntaxKind::If,
923        "else" => SyntaxKind::Else,
924        "for" => SyntaxKind::For,
925        "in" => SyntaxKind::In,
926        "while" => SyntaxKind::While,
927        "break" => SyntaxKind::Break,
928        "continue" => SyntaxKind::Continue,
929        "return" => SyntaxKind::Return,
930        "import" => SyntaxKind::Import,
931        "include" => SyntaxKind::Include,
932        "as" => SyntaxKind::As,
933        _ => return None,
934    })
935}
936
937trait ScannerExt {
938    fn advance(&mut self, by: usize);
939    fn eat_newline(&mut self) -> bool;
940}
941
942impl ScannerExt for Scanner<'_> {
943    fn advance(&mut self, by: usize) {
944        self.jump(self.cursor() + by);
945    }
946
947    fn eat_newline(&mut self) -> bool {
948        let ate = self.eat_if(is_newline);
949        if ate && self.before().ends_with('\r') {
950            self.eat_if('\n');
951        }
952        ate
953    }
954}
955
956/// Whether a character will become a [`SyntaxKind::Space`] token.
957#[inline]
958fn is_space(character: char, mode: LexMode) -> bool {
959    match mode {
960        LexMode::Markup => matches!(character, ' ' | '\t') || is_newline(character),
961        _ => character.is_whitespace(),
962    }
963}
964
/// Whether a character is interpreted as a newline by Typst.
///
/// This covers Line Feed, Vertical Tab, Form Feed and Carriage Return, plus
/// Next Line (U+0085), Line Separator (U+2028) and Paragraph Separator
/// (U+2029).
#[inline]
pub fn is_newline(character: char) -> bool {
    match character {
        // U+000A..=U+000D: Line Feed, Vertical Tab, Form Feed, Carriage Return.
        '\n'..='\r' => true,
        // Next Line, Line Separator, Paragraph Separator.
        '\u{0085}' | '\u{2028}' | '\u{2029}' => true,
        _ => false,
    }
}
976
/// Extracts a prefix of the text that is a link and also returns whether the
/// parentheses and brackets in the link were balanced.
///
/// The link extends over ASCII alphanumerics and a fixed set of URL
/// punctuation. Opening `(` and `[` are always accepted. A closing `)` or `]`
/// is only accepted if it matches the innermost open bracket; otherwise the
/// link ends right before it. Trailing punctuation that is likely to belong
/// to the surrounding text (`!`, `,`, `.`, `:`, `;`, `?`, `'`) is stripped
/// from the end of the link.
///
/// The returned flag is `true` only if every bracket opened inside the link
/// was also closed inside it.
pub fn link_prefix(text: &str) -> (&str, bool) {
    // Stack of brackets that are currently open within the link.
    let mut open = Vec::new();
    let mut end = text.len();

    for (idx, c) in text.char_indices() {
        let accepted = match c {
            '0'..='9' | 'a'..='z' | 'A'..='Z' => true,
            '!' | '#' | '$' | '%' | '&' | '*' | '+' => true,
            ',' | '-' | '.' | '/' | ':' | ';' | '=' => true,
            '?' | '@' | '_' | '~' | '\'' => true,
            '[' | '(' => {
                open.push(c);
                true
            }
            ']' | ')' => {
                let opener = if c == ']' { '[' } else { '(' };
                // Only pop on a real match. A mismatched closer ends the link
                // but must leave the unclosed opener on the stack, so that the
                // link is still reported as unbalanced.
                if open.last() == Some(&opener) {
                    open.pop();
                    true
                } else {
                    false
                }
            }
            _ => false,
        };

        if !accepted {
            end = idx;
            break;
        }
    }

    // Don't include the trailing characters likely to be part of text.
    let link = text[..end].trim_end_matches(['!', ',', '.', ':', ';', '?', '\'']);

    (link, open.is_empty())
}
1013
1014/// Split text at newlines. These newline characters are not kept.
1015pub fn split_newlines(text: &str) -> Vec<&str> {
1016    let mut s = Scanner::new(text);
1017    let mut lines = Vec::new();
1018    let mut start = 0;
1019    let mut end = 0;
1020
1021    while let Some(c) = s.eat() {
1022        if is_newline(c) {
1023            if c == '\r' {
1024                s.eat_if('\n');
1025            }
1026
1027            lines.push(&text[start..end]);
1028            start = s.cursor();
1029        }
1030        end = s.cursor();
1031    }
1032
1033    lines.push(&text[start..]);
1034    lines
1035}
1036
1037/// Count the number of newlines in text.
1038fn count_newlines(text: &str) -> usize {
1039    let mut newlines = 0;
1040    let mut s = Scanner::new(text);
1041    while let Some(c) = s.eat() {
1042        if is_newline(c) {
1043            if c == '\r' {
1044                s.eat_if('\n');
1045            }
1046            newlines += 1;
1047        }
1048    }
1049    newlines
1050}
1051
1052/// Whether a string is a valid Typst identifier.
1053///
1054/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
1055/// - `_` as a starting character,
1056/// - `_` and `-` as continuing characters.
1057///
1058/// [uax31]: http://www.unicode.org/reports/tr31/
1059#[inline]
1060pub fn is_ident(string: &str) -> bool {
1061    let mut chars = string.chars();
1062    chars
1063        .next()
1064        .is_some_and(|c| is_id_start(c) && chars.all(is_id_continue))
1065}
1066
1067/// Whether a character can start an identifier.
1068#[inline]
1069pub fn is_id_start(c: char) -> bool {
1070    is_xid_start(c) || c == '_'
1071}
1072
1073/// Whether a character can continue an identifier.
1074#[inline]
1075pub fn is_id_continue(c: char) -> bool {
1076    is_xid_continue(c) || c == '_' || c == '-'
1077}
1078
1079/// Whether a character can start an identifier in math.
1080#[inline]
1081fn is_math_id_start(c: char) -> bool {
1082    is_xid_start(c)
1083}
1084
1085/// Whether a character can continue an identifier in math.
1086#[inline]
1087fn is_math_id_continue(c: char) -> bool {
1088    is_xid_continue(c) && c != '_'
1089}
1090
1091/// Whether a character can be part of a label literal's name.
1092#[inline]
1093fn is_valid_in_label_literal(c: char) -> bool {
1094    is_id_continue(c) || matches!(c, ':' | '.')
1095}
1096
1097/// Returns true if this string is valid in a label literal.
1098pub fn is_valid_label_literal_id(id: &str) -> bool {
1099    !id.is_empty() && id.chars().all(is_valid_in_label_literal)
1100}
typst_syntax/lexer.rs

typst_syntax/
lexer.rs