Skip to main content

typst_syntax/
lexer.rs

1use std::num::IntErrorKind;
2
3use ecow::{EcoString, EcoVec, eco_format, eco_vec};
4use typst_utils::default_math_class;
5use unicode_ident::{is_xid_continue, is_xid_start};
6use unicode_math_class::MathClass;
7use unicode_script::{Script, UnicodeScript};
8use unicode_segmentation::UnicodeSegmentation;
9use unscanny::Scanner;
10
11use crate::{SyntaxKind, SyntaxMode, SyntaxNode};
12
/// A lexer over a source code string which returns tokens.
///
/// Tokens are pulled one at a time with [`Lexer::next`]. The kinds of tokens
/// produced depend on the current [`SyntaxMode`], which can be changed between
/// tokens with [`Lexer::set_mode`].
#[derive(Clone)]
pub(super) struct Lexer<'s> {
    /// The scanner: contains the underlying string and location as a "cursor".
    s: Scanner<'s>,
    /// The mode the lexer is in. This determines which kinds of tokens it
    /// produces.
    mode: SyntaxMode,
    /// Whether the last token contained a newline. Reset at the start of every
    /// call to [`Lexer::next`].
    newline: bool,
    /// An error plus hints for the current token being produced. This is always
    /// `None` between calls to [`Lexer::next`]: it is set by `Lexer::error`,
    /// extended by `Lexer::hint`, and taken again when the token's node is
    /// built.
    error: Option<(EcoString, EcoVec<EcoString>)>,
}
27
28impl<'s> Lexer<'s> {
29    /// Create a new lexer with the given mode and a prefix to offset column
30    /// calculations.
31    pub fn new(text: &'s str, mode: SyntaxMode) -> Self {
32        Self {
33            s: Scanner::new(text),
34            mode,
35            newline: false,
36            error: None,
37        }
38    }
39
40    /// Get the current lexing mode.
41    pub fn mode(&self) -> SyntaxMode {
42        self.mode
43    }
44
45    /// Change the lexing mode.
46    pub fn set_mode(&mut self, mode: SyntaxMode) {
47        self.mode = mode;
48    }
49
50    /// The index in the string at which the last token ends and next token
51    /// will start.
52    pub fn cursor(&self) -> usize {
53        self.s.cursor()
54    }
55
56    /// Jump to the given index in the string.
57    pub fn jump(&mut self, index: usize) {
58        self.s.jump(index);
59    }
60
61    /// Whether the last token contained a newline.
62    pub fn newline(&self) -> bool {
63        self.newline
64    }
65
66    /// The number of characters until the most recent newline from an index.
67    pub fn column(&self, index: usize) -> usize {
68        let mut s = self.s; // Make a new temporary scanner (cheap).
69        s.jump(index);
70        s.before().chars().rev().take_while(|&c| !is_newline(c)).count()
71    }
72}
73
74impl Lexer<'_> {
75    /// Construct a full-positioned syntax error.
76    fn error(&mut self, message: impl Into<EcoString>) -> SyntaxKind {
77        debug_assert!(self.error.is_none());
78        self.error = Some((message.into(), eco_vec![]));
79        SyntaxKind::Error
80    }
81
82    /// If the current node is an error, adds a hint.
83    fn hint(&mut self, message: impl Into<EcoString>) {
84        if let Some((_message, hints)) = &mut self.error {
85            hints.push(message.into());
86        }
87    }
88}
89
90/// Shared methods with all [`SyntaxMode`].
91impl Lexer<'_> {
92    /// Return the next token in our text. Returns both the [`SyntaxNode`]
93    /// and the raw [`SyntaxKind`] to make it more ergonomic to check the kind
94    pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) {
95        debug_assert!(self.error.is_none());
96        let start = self.s.cursor();
97
98        self.newline = false;
99        let kind = match self.s.eat() {
100            Some(c) if is_space(c, self.mode) => self.whitespace(start, c),
101            Some('#') if start == 0 && self.s.eat_if('!') => self.shebang(),
102            Some('/') if self.s.eat_if('/') => self.line_comment(),
103            Some('/') if self.s.eat_if('*') => self.block_comment(),
104            Some('*') if self.s.eat_if('/') => {
105                let error = self.error("unexpected end of block comment");
106                self.hint(
107                    "consider escaping the `*` with a backslash or \
108                     opening the block comment with `/*`",
109                );
110                error
111            }
112            Some('`') if self.mode != SyntaxMode::Math => return self.raw(),
113            Some(c) => match self.mode {
114                SyntaxMode::Markup => self.markup(start, c),
115                SyntaxMode::Math => match self.math(start, c) {
116                    (kind, None) => kind,
117                    (kind, Some(node)) => return (kind, node),
118                },
119                SyntaxMode::Code => self.code(start, c),
120            },
121
122            None => SyntaxKind::End,
123        };
124
125        let text = self.s.from(start);
126        let node = match self.error.take() {
127            Some((message, hints)) => SyntaxNode::error(message, text).with_hints(hints),
128            None => SyntaxNode::leaf(kind, text),
129        };
130        (kind, node)
131    }
132
133    /// Eat whitespace characters greedily.
134    fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
135        let more = self.s.eat_while(|c| is_space(c, self.mode));
136        let newlines = match c {
137            // Optimize eating a single space.
138            ' ' if more.is_empty() => 0,
139            _ => count_newlines(self.s.from(start)),
140        };
141
142        self.newline = newlines > 0;
143        if self.mode == SyntaxMode::Markup && newlines >= 2 {
144            SyntaxKind::Parbreak
145        } else {
146            SyntaxKind::Space
147        }
148    }
149
150    fn shebang(&mut self) -> SyntaxKind {
151        self.s.eat_until(is_newline);
152        SyntaxKind::Shebang
153    }
154
155    fn line_comment(&mut self) -> SyntaxKind {
156        self.s.eat_until(is_newline);
157        SyntaxKind::LineComment
158    }
159
160    fn block_comment(&mut self) -> SyntaxKind {
161        let mut state = '_';
162        let mut depth = 1;
163
164        // Find the first `*/` that does not correspond to a nested `/*`.
165        while let Some(c) = self.s.eat() {
166            state = match (state, c) {
167                ('*', '/') => {
168                    depth -= 1;
169                    if depth == 0 {
170                        break;
171                    }
172                    '_'
173                }
174                ('/', '*') => {
175                    depth += 1;
176                    '_'
177                }
178                _ => c,
179            }
180        }
181
182        SyntaxKind::BlockComment
183    }
184}
185
/// Raw.
impl Lexer<'_> {
    /// We parse entire raw segments in the lexer as a convenience to avoid
    /// going to and from the parser for each raw section.
    ///
    /// Called with the first opening backtick already eaten. Returns either a
    /// complete `Raw` node (delimiters, optional language tag, text, and
    /// trimmed parts as children) or an error node spanning everything up to
    /// the end of the text if the closing delimiter is missing.
    fn raw(&mut self) -> (SyntaxKind, SyntaxNode) {
        // The first backtick was eaten before this function was called.
        let start = self.s.cursor() - 1;

        // Determine number of opening backticks.
        let mut backticks = 1;
        while self.s.eat_if('`') {
            backticks += 1;
        }

        // Special case for ``: exactly two backticks are an empty raw element
        // made of an opening and a closing delimiter.
        if backticks == 2 {
            let nodes = vec![
                SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
                SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
            ];
            return (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes));
        }

        // Find end of raw text: a run of as many consecutive backticks as the
        // element was opened with.
        let mut found = 0;
        while found < backticks {
            match self.s.eat() {
                Some('`') => found += 1,
                Some(_) => found = 0,
                None => {
                    let message = "unclosed raw text";
                    let error = SyntaxNode::error(message, self.s.from(start));
                    return (SyntaxKind::Error, error);
                }
            }
        }
        let end = self.s.cursor();

        // A separate scanner over just the text between the delimiters.
        let mut inner = Scanner::new(self.s.get(start + backticks..end - backticks));
        let inner_len = inner.string().len();

        // Opening delimiter. The node is built from the closing backticks and
        // cloned, as both delimiters have the same text.
        let delim = SyntaxNode::leaf(SyntaxKind::RawDelim, self.s.from(end - backticks));
        let mut nodes = vec![delim.clone()];

        // Three or more backticks make a blocky raw element with an optional
        // language tag, a single backtick makes an inline one.
        let mut tag = None;
        let mut diff_future_tag_len = None;
        if delim.len() >= 3 {
            (tag, diff_future_tag_len) = Self::raw_lang_tag(&mut inner);
            if let Some(tag) = tag {
                nodes.push(SyntaxNode::leaf(SyntaxKind::RawLang, tag));
            }
            Self::blocky_raw(&mut inner, &mut nodes);
        } else {
            Self::inline_raw(&mut inner, &mut nodes);
        }

        // Closing delimiter.
        nodes.push(delim);

        let mut raw = SyntaxNode::inner(SyntaxKind::Raw, nodes);

        Self::add_raw_warnings(&mut raw, backticks, diff_future_tag_len, tag, inner_len);

        (SyntaxKind::Raw, raw)
    }

    /// Lex the raw language tag into an optional string and return the length
    /// of the future language tag if it will differ in the next version.
    ///
    /// The current tag is an identifier at the very start of the raw text. The
    /// future tag is all text up to the first whitespace or backtick. On
    /// return, the scanner is positioned directly after the current tag (or
    /// where it started if there is none).
    ///
    /// See [`Self::add_raw_warnings`] for more.
    fn raw_lang_tag<'a>(s: &mut Scanner<'a>) -> (Option<&'a str>, Option<usize>) {
        let start = s.cursor();
        let future_tag = s.eat_until(|c: char| c.is_whitespace() || c == '`');
        if future_tag.is_empty() {
            // Future tag is always longer than current tag, if empty, we have
            // no current tag either.
            return (None, None);
        }
        // Go back and lex the tag by the current rules.
        s.jump(start);
        let tag = s.eat_if(is_id_start).then(|| {
            s.eat_while(is_id_continue);
            s.from(start)
        });
        // Only report the future tag's length if it is not the same text as the
        // current tag, i.e. if there is no current tag or the lengths differ.
        let diff_future_tag_len = tag
            .is_none_or(|tag| tag.len() != future_tag.len())
            .then_some(future_tag.len());
        (tag, diff_future_tag_len)
    }

    /// Raw blocks parse a language tag, have smart behavior for trimming
    /// whitespace in the start/end lines, and trim common leading whitespace
    /// from all other lines as the "dedent". The exact behavior is described
    /// below.
    ///
    /// ### The initial line:
    /// - The language tag is already handled above.
    /// - We check the rest of the line and if all characters are whitespace,
    ///   trim it. Otherwise we trim a single leading space if present.
    ///   - If more trimmed characters follow on future lines, they will be
    ///     merged into the same trimmed element.
    /// - If we didn't trim the entire line, the rest is kept as text.
    ///
    /// ### Inner lines:
    /// - We determine the "dedent" by iterating over the lines. The dedent is
    ///   the minimum number of leading whitespace characters (not bytes) before
    ///   each line that has any non-whitespace characters.
    ///   - The opening delimiter's line does not contribute to the dedent, but
    ///     the closing delimiter's line does (even if that line is entirely
    ///     whitespace up to the delimiter).
    /// - We then trim the newline and dedent characters of each line, and add a
    ///   (potentially empty) text element of all remaining characters.
    ///
    /// ### The final line:
    /// - If the last line is entirely whitespace, it is trimmed.
    /// - Otherwise its text is kept like an inner line. However, if the last
    ///   non-whitespace character of the final line is a backtick, then one
    ///   ascii space (if present) is trimmed from the end.
    fn blocky_raw(s: &mut Scanner, nodes: &mut Vec<SyntaxNode>) {
        // The lines between the backticks.
        let mut lines = split_newlines(s.after());

        // Determine dedent level.
        let dedent = lines
            .iter()
            .skip(1)
            .filter(|line| !line.chars().all(char::is_whitespace))
            // The line with the closing ``` is always taken into account
            .chain(lines.last())
            .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
            .min()
            .unwrap_or(0);

        // Trim whitespace from the last line. Will be added as a `RawTrimmed`
        // kind by the `!s.done()` check at the end of this function.
        if lines.last().is_some_and(|last| last.chars().all(char::is_whitespace)) {
            lines.pop();
        } else if let Some(last) = lines.last_mut() {
            // If last line ends in a backtick, try to trim a single space. This
            // check must happen before we add the first line since the last and
            // first lines might be the same.
            if last.trim_end().ends_with('`') {
                *last = last.strip_suffix(' ').unwrap_or(last);
            }
        }

        // A closure for pushing a leaf node and updating the cursor in one
        // step. The leaf contains everything the scanner moved over since the
        // previous push.
        let mut prev = s.cursor();
        let mut push_leaf = |kind, s: &Scanner| {
            nodes.push(SyntaxNode::leaf(kind, s.from(prev)));
            prev = s.cursor();
        };

        let mut lines = lines.into_iter();

        // Handle the first line: trim if all whitespace, or trim a single space
        // at the start. Note that the first line does not affect the dedent
        // value.
        if let Some(first_line) = lines.next() {
            if first_line.chars().all(char::is_whitespace) {
                s.advance(first_line.len());
                // This is the only spot we advance the scanner, but don't
                // immediately call `push_leaf`. But the rest of the function
                // ensures we will always add this text to a `RawTrimmed` later.
                debug_assert!(!s.done());
                // A proof by cases follows:
                // # First case: The loop runs
                // If the loop runs, there must be a newline following, so
                // the scanner is not done. And if the loop runs, the first
                // thing it does is add a trimmed element.
                // # Second case: The final if-statement runs.
                // To _not_ reach the loop from here, we must have only one or
                // two lines:
                // 1. If one line, we cannot be here, because the first and last
                //    lines are the same, so this line will have been removed by
                //    the check for the last line being all whitespace.
                // 2. If two lines, the loop will run unless the last is fully
                //    whitespace, but if it is, it will have been popped, then
                //    the final if-statement will run because the text removed
                //    by the last line must include at least a newline, so
                //    the scanner is not done here.
            } else {
                // Compute the end of the line up front, because eating the
                // leading space below moves the cursor.
                let line_end = s.cursor() + first_line.len();
                if s.eat_if(' ') {
                    // Trim a single space after the lang tag or backticks on
                    // the first line.
                    push_leaf(SyntaxKind::RawTrimmed, s);
                }
                // We know here that the rest of the line is non-empty.
                s.jump(line_end);
                push_leaf(SyntaxKind::Text, s);
            }
        }

        // Add lines. Each one contributes a trimmed element (the newline, the
        // dedent, and anything skipped but not yet pushed) followed by a text
        // element with the rest of the line.
        for line in lines {
            // The dedent is counted in characters, so convert it to bytes.
            let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
            s.eat_newline();
            s.advance(offset);
            push_leaf(SyntaxKind::RawTrimmed, s);
            s.advance(line.len() - offset);
            push_leaf(SyntaxKind::Text, s);
        }

        // Add final trimmed: whatever is left after the last kept line.
        if !s.done() {
            nodes.push(SyntaxNode::leaf(SyntaxKind::RawTrimmed, s.after()));
        }
    }

    /// Inline raw text is split on lines with non-newlines as `Text` kinds and
    /// newlines as `RawTrimmed`. Inline raw text does not dedent the text, all
    /// non-newline whitespace is kept.
    ///
    /// A `Text` node is pushed before every newline and at the very end, even
    /// if its text is empty.
    fn inline_raw(s: &mut Scanner, nodes: &mut Vec<SyntaxNode>) {
        // Start of the text that has not been pushed as a node yet.
        let mut prev = s.cursor();
        while !s.done() {
            if s.at(is_newline) {
                nodes.push(SyntaxNode::leaf(SyntaxKind::Text, s.from(prev)));
                prev = s.cursor();
                s.eat_newline();
                nodes.push(SyntaxNode::leaf(SyntaxKind::RawTrimmed, s.from(prev)));
                prev = s.cursor();
                continue;
            }
            s.eat();
        }
        nodes.push(SyntaxNode::leaf(SyntaxKind::Text, s.from(prev)));
    }

    /// Add warnings if the raw language tag will differ in the next version of
    /// Typst or if there is a language tag but the raw text is empty.
    ///
    /// Currently, we parse the language tag only up to the end of a valid
    /// identifier or the first whitespace. So if we start with `C++`, the
    /// identifier `C` will be the language tag, and the raw text will start
    /// with `++`. If we start with `++C`, we will have no language tag and the
    /// raw text will start with `++C`.
    ///
    /// In the next version of Typst, we will parse all text up to the first
    /// whitespace or backtick, so tags like `C++` or `html.j2` or `$!#%@` can
    /// be written without issue.
    ///
    /// However, this may cause some documents relying on the behavior of raw
    /// text starting like `C++` or `++C` to change, so we are giving a warning
    /// for those cases.
    ///
    /// The arguments are the values computed in [`Self::raw`]: the number of
    /// opening `backticks`, the results of [`Self::raw_lang_tag`], and the
    /// byte length of the text between the delimiters (`inner_len`).
    fn add_raw_warnings(
        raw: &mut SyntaxNode,
        backticks: usize,
        diff_future_tag_len: Option<usize>,
        tag: Option<&str>,
        inner_len: usize,
    ) {
        // Add a warning if the tag will differ in the next version.
        if let Some(future_tag_len) = diff_future_tag_len {
            // Ranges start after the opening backticks.
            // NOTE(review): presumably relative to the start of the raw node;
            // confirm against `SyntaxNode::warn_at`.
            let future_range = backticks..backticks + future_tag_len;
            if let Some(tag) = tag {
                raw.warn_at(
                    future_range,
                    "no whitespace between language tag and raw text",
                );
                raw.hint(eco_format!(
                    "currently, Typst is treating `{tag}` as the language tag"
                ));
                raw.hint(
                    "in the next version of Typst, this will change and we will treat \
                        all text until the first whitespace as the language tag",
                );
                let tag_range = backticks..backticks + tag.len();
                raw.hint_at(tag_range.clone(), eco_format!(
                    "if the current behavior is correct, please add a space after `{tag}`"
                ));
                raw.hint_at(
                    tag_range,
                    "otherwise, add a space or newline after the initial backticks",
                );
            } else {
                raw.warn_at(future_range, "no whitespace before raw text");
                raw.hint(
                    "in the next version of Typst, this text will be treated as \
                        the language tag for this element",
                );
                raw.hint("to avoid this, add a space after the initial backticks");
            }
        } else if let Some(tag) = tag
            && inner_len == tag.len()
        {
            // The tag makes up the entire inner text, so there is no raw text.
            // Empty with no tag/ws is only possible with exactly two backticks,
            // which is handled by the caller.
            raw.warn("empty raw text");
            raw.hint(eco_format!("Typst is treating `{tag}` as the language tag"));
            let tag_range = backticks..backticks + tag.len();
            raw.hint_at(
                tag_range,
                "to treat this as text, add a space after the initial backticks",
            );
        }
    }
}
484
485/// Markup.
486impl Lexer<'_> {
487    fn markup(&mut self, start: usize, c: char) -> SyntaxKind {
488        match c {
489            '\\' => self.backslash(),
490            'h' if self.s.eat_if("ttp://") => self.link(),
491            'h' if self.s.eat_if("ttps://") => self.link(),
492            '<' if self.s.at(is_id_continue) => self.label(),
493            '@' if self.s.at(is_id_continue) => self.ref_marker(),
494
495            '.' if self.s.eat_if("..") => SyntaxKind::Shorthand,
496            '-' if self.s.eat_if("--") => SyntaxKind::Shorthand,
497            '-' if self.s.eat_if('-') => SyntaxKind::Shorthand,
498            '-' if self.s.eat_if('?') => SyntaxKind::Shorthand,
499            '-' if self.s.at(char::is_numeric) => SyntaxKind::Shorthand,
500            '*' if !self.in_word() => SyntaxKind::Star,
501            '_' if !self.in_word() => SyntaxKind::Underscore,
502
503            '#' => SyntaxKind::Hash,
504            '[' => SyntaxKind::LeftBracket,
505            ']' => SyntaxKind::RightBracket,
506            '\'' => SyntaxKind::SmartQuote,
507            '"' => SyntaxKind::SmartQuote,
508            '$' => SyntaxKind::Dollar,
509            '~' => SyntaxKind::Shorthand,
510            ':' => SyntaxKind::Colon,
511            '=' => {
512                self.s.eat_while('=');
513                if self.space_or_end() { SyntaxKind::HeadingMarker } else { self.text() }
514            }
515            '-' if self.space_or_end() => SyntaxKind::ListMarker,
516            '+' if self.space_or_end() => SyntaxKind::EnumMarker,
517            '/' if self.space_or_end() => SyntaxKind::TermMarker,
518            '0'..='9' => self.numbering(start),
519
520            _ => self.text(),
521        }
522    }
523
524    fn backslash(&mut self) -> SyntaxKind {
525        if self.s.eat_if("u{") {
526            let hex = self.s.eat_while(char::is_ascii_alphanumeric);
527            if !self.s.eat_if('}') {
528                return self.error("unclosed Unicode escape sequence");
529            }
530
531            if u32::from_str_radix(hex, 16)
532                .ok()
533                .and_then(std::char::from_u32)
534                .is_none()
535            {
536                return self.error(eco_format!("invalid Unicode codepoint: {hex}"));
537            }
538
539            return SyntaxKind::Escape;
540        }
541
542        if self.s.done() || self.s.at(char::is_whitespace) {
543            SyntaxKind::Linebreak
544        } else {
545            self.s.eat();
546            SyntaxKind::Escape
547        }
548    }
549
550    fn link(&mut self) -> SyntaxKind {
551        let (link, balanced) = link_prefix(self.s.after());
552        self.s.advance(link.len());
553
554        if !balanced {
555            return self.error(
556                "automatic links cannot contain unbalanced brackets, \
557                 use the `link` function instead",
558            );
559        }
560
561        SyntaxKind::Link
562    }
563
564    fn numbering(&mut self, start: usize) -> SyntaxKind {
565        self.s.eat_while(char::is_ascii_digit);
566
567        let read = self.s.from(start);
568        if self.s.eat_if('.') && self.space_or_end() && read.parse::<u64>().is_ok() {
569            return SyntaxKind::EnumMarker;
570        }
571
572        self.text()
573    }
574
575    fn ref_marker(&mut self) -> SyntaxKind {
576        self.s.eat_while(is_valid_in_label_literal);
577
578        // Don't include the trailing characters likely to be part of text.
579        while matches!(self.s.scout(-1), Some('.' | ':')) {
580            self.s.uneat();
581        }
582
583        SyntaxKind::RefMarker
584    }
585
586    fn label(&mut self) -> SyntaxKind {
587        let label = self.s.eat_while(is_valid_in_label_literal);
588        if label.is_empty() {
589            return self.error("label cannot be empty");
590        }
591
592        if !self.s.eat_if('>') {
593            return self.error("unclosed label");
594        }
595
596        SyntaxKind::Label
597    }
598
599    fn text(&mut self) -> SyntaxKind {
600        macro_rules! table {
601            ($(|$c:literal)*) => {
602                static TABLE: [bool; 128] = {
603                    let mut t = [false; 128];
604                    $(t[$c as usize] = true;)*
605                    t
606                };
607            };
608        }
609
610        table! {
611            | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
612            | '[' | ']' | '~' | '-' | '.' | '\'' | '"' | '*' | '_'
613            | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
614        };
615
616        loop {
617            self.s.eat_until(|c: char| {
618                TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
619            });
620
621            // Continue with the same text node if the thing would become text
622            // anyway.
623            let mut s = self.s;
624            match s.eat() {
625                Some(' ') if s.at(char::is_alphanumeric) => {}
626                Some('/') if !s.at(['/', '*']) => {}
627                Some('-') if !s.at(['-', '?']) => {}
628                Some('.') if !s.at("..") => {}
629                Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
630                Some('@') if !s.at(is_valid_in_label_literal) => {}
631                _ => break,
632            }
633
634            self.s = s;
635        }
636
637        SyntaxKind::Text
638    }
639
640    fn in_word(&self) -> bool {
641        let wordy = |c: Option<char>| {
642            c.is_some_and(|c| {
643                c.is_alphanumeric()
644                    && !matches!(
645                        c.script(),
646                        Script::Han
647                            | Script::Hiragana
648                            | Script::Katakana
649                            | Script::Hangul
650                    )
651            })
652        };
653        let prev = self.s.scout(-2);
654        let next = self.s.peek();
655        wordy(prev) && wordy(next)
656    }
657
658    fn space_or_end(&self) -> bool {
659        self.s.done()
660            || self.s.at(char::is_whitespace)
661            || self.s.at("//")
662            || self.s.at("/*")
663    }
664}
665
666/// Math.
667impl Lexer<'_> {
    /// Lex a math token that began with the already eaten character `c` at
    /// index `start`.
    ///
    /// Returns the token's kind and, if the token was lexed as an identifier
    /// spanning more than one grapheme (possibly with field accesses), its
    /// finished node. When the node is `None`, the caller builds a leaf node
    /// from the text eaten since `start`.
    fn math(&mut self, start: usize, c: char) -> (SyntaxKind, Option<SyntaxNode>) {
        let kind = match c {
            '\\' => self.backslash(),
            '"' => self.string(),

            // Shorthands. The guards eat the rest of the shorthand, so arm
            // order matters: where one shorthand is a prefix of another (e.g.
            // `<==` and `<==>`), the longer one is listed first.
            '-' if self.s.eat_if(">>") => SyntaxKind::MathShorthand,
            '-' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            '-' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            ':' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            ':' if self.s.eat_if(":=") => SyntaxKind::MathShorthand,
            '!' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            '.' if self.s.eat_if("..") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("==>") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("-->") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("--") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("-<") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("<-") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("<<") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("==") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("~~") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('<') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('-') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('~') => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if(">>") => SyntaxKind::MathShorthand,
            '=' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
            '=' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            '=' if self.s.eat_if(':') => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if('|') => SyntaxKind::MathShorthand,
            '~' if self.s.eat_if("~>") => SyntaxKind::MathShorthand,
            '~' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            // Single-character shorthands, tried after all longer ones.
            '*' | '-' | '~' => SyntaxKind::MathShorthand,

            '.' => SyntaxKind::Dot,
            ',' => SyntaxKind::Comma,
            ';' => SyntaxKind::Semicolon,

            '#' => SyntaxKind::Hash,
            '_' => SyntaxKind::Underscore,
            '$' => SyntaxKind::Dollar,
            '/' => SyntaxKind::Slash,
            '^' => SyntaxKind::Hat,
            '&' => SyntaxKind::MathAlignPoint,
            '√' | '∛' | '∜' => SyntaxKind::Root,
            '!' => SyntaxKind::Bang,

            // Any number of consecutive primes form a single token.
            '\'' => {
                self.s.eat_while('\'');
                SyntaxKind::MathPrimes
            }

            // We lex delimiters as `{Left,Right}{Brace,Paren}` and convert back
            // to `MathText` or `MathShorthand` in the parser.
            '(' => SyntaxKind::LeftParen,
            ')' => SyntaxKind::RightParen,
            // TODO: We may instead want to add `MathOpening` and `MathClosing`
            // kinds for these.
            '[' if self.s.eat_if('|') => SyntaxKind::LeftBrace,
            '|' if self.s.eat_if(']') => SyntaxKind::RightBrace,
            c if default_math_class(c) == Some(MathClass::Opening) => {
                SyntaxKind::LeftBrace
            }
            c if default_math_class(c) == Some(MathClass::Closing) => {
                SyntaxKind::RightBrace
            }

            // Identifiers.
            c if is_math_id_start(c) && self.s.at(is_math_id_continue) => {
                self.s.eat_while(is_math_id_continue);
                // The byte index of the last grapheme tells us whether there
                // is more than one.
                let (last_index, _) =
                    self.s.from(start).grapheme_indices(true).next_back().unwrap();
                if last_index == 0 {
                    // If this was just a single grapheme.
                    SyntaxKind::MathText
                } else {
                    let (kind, node) = self.math_ident_or_field(start);
                    return (kind, Some(node));
                }
            }

            // Other math atoms.
            _ => self.math_text(start, c),
        };
        (kind, None)
    }
760
761    /// Parse a single `MathIdent` or an entire `MathFieldAccess`.
762    fn math_ident_or_field(&mut self, start: usize) -> (SyntaxKind, SyntaxNode) {
763        let mut kind = SyntaxKind::MathIdent;
764        let mut node = SyntaxNode::leaf(kind, self.s.from(start));
765        while let Some(ident) = self.maybe_dot_ident() {
766            kind = SyntaxKind::MathFieldAccess;
767            let field_children = vec![
768                node,
769                SyntaxNode::leaf(SyntaxKind::Dot, '.'),
770                SyntaxNode::leaf(SyntaxKind::MathIdent, ident),
771            ];
772            node = SyntaxNode::inner(kind, field_children);
773        }
774        (kind, node)
775    }
776
777    /// If at a dot and a math identifier, eat and return the identifier.
778    fn maybe_dot_ident(&mut self) -> Option<&str> {
779        if self.s.scout(1).is_some_and(is_math_id_start) && self.s.eat_if('.') {
780            let ident_start = self.s.cursor();
781            self.s.eat();
782            self.s.eat_while(is_math_id_continue);
783            Some(self.s.from(ident_start))
784        } else {
785            None
786        }
787    }
788
789    fn math_text(&mut self, start: usize, c: char) -> SyntaxKind {
790        // Keep numbers and grapheme clusters together.
791        if c.is_numeric() {
792            self.s.eat_while(char::is_numeric);
793            let mut s = self.s;
794            if s.eat_if('.') && !s.eat_while(char::is_numeric).is_empty() {
795                self.s = s;
796            }
797        } else {
798            let len = self
799                .s
800                .get(start..self.s.string().len())
801                .graphemes(true)
802                .next()
803                .map_or(0, str::len);
804            self.s.jump(start + len);
805        }
806        SyntaxKind::MathText
807    }
808
809    /// Handle named arguments in math function call.
810    pub fn maybe_math_named_arg(&mut self, start: usize) -> Option<SyntaxNode> {
811        let cursor = self.s.cursor();
812        self.s.jump(start);
813        if self.s.eat_if(is_id_start) {
814            self.s.eat_while(is_id_continue);
815            // Check that a colon directly follows the identifier, and not the
816            // `:=` or `::=` math shorthands.
817            if self.s.at(':') && !self.s.at(":=") && !self.s.at("::=") {
818                // Check that the identifier is not just `_`.
819                let node = if self.s.from(start) != "_" {
820                    SyntaxNode::leaf(SyntaxKind::Ident, self.s.from(start))
821                } else {
822                    let message = "expected identifier, found underscore";
823                    SyntaxNode::error(message, self.s.from(start))
824                };
825                return Some(node);
826            }
827        }
828        self.s.jump(cursor);
829        None
830    }
831
832    /// Handle spread arguments in math function call.
833    pub fn maybe_math_spread_arg(&mut self, start: usize) -> Option<SyntaxNode> {
834        let cursor = self.s.cursor();
835        self.s.jump(start);
836        if self.s.eat_if("..") {
837            // We only infer a spread operator if it is not followed by:
838            // - a space/trivia/end
839            // - a dot (this would clash with the `...` math shorthand)
840            // - an end of arg character: `,`, `;`, ')', `$` (spreads nothing)
841            if !self.space_or_end() && !self.s.at(['.', ',', ';', ')', '$']) {
842                let node = SyntaxNode::leaf(SyntaxKind::Dots, self.s.from(start));
843                return Some(node);
844            }
845        }
846        self.s.jump(cursor);
847        None
848    }
849}
850
851/// Code.
852impl Lexer<'_> {
853    fn code(&mut self, start: usize, c: char) -> SyntaxKind {
854        match c {
855            '<' if self.s.at(is_id_continue) => self.label(),
856            '0'..='9' => self.number(start, c),
857            '.' if self.s.at(char::is_ascii_digit) => self.number(start, c),
858            '"' => self.string(),
859
860            '=' if self.s.eat_if('=') => SyntaxKind::EqEq,
861            '!' if self.s.eat_if('=') => SyntaxKind::ExclEq,
862            '<' if self.s.eat_if('=') => SyntaxKind::LtEq,
863            '>' if self.s.eat_if('=') => SyntaxKind::GtEq,
864            '+' if self.s.eat_if('=') => SyntaxKind::PlusEq,
865            '-' | '\u{2212}' if self.s.eat_if('=') => SyntaxKind::HyphEq,
866            '*' if self.s.eat_if('=') => SyntaxKind::StarEq,
867            '/' if self.s.eat_if('=') => SyntaxKind::SlashEq,
868            '.' if self.s.eat_if('.') => SyntaxKind::Dots,
869            '=' if self.s.eat_if('>') => SyntaxKind::Arrow,
870
871            '{' => SyntaxKind::LeftBrace,
872            '}' => SyntaxKind::RightBrace,
873            '[' => SyntaxKind::LeftBracket,
874            ']' => SyntaxKind::RightBracket,
875            '(' => SyntaxKind::LeftParen,
876            ')' => SyntaxKind::RightParen,
877            '$' => SyntaxKind::Dollar,
878            ',' => SyntaxKind::Comma,
879            ';' => SyntaxKind::Semicolon,
880            ':' => SyntaxKind::Colon,
881            '.' => SyntaxKind::Dot,
882            '+' => SyntaxKind::Plus,
883            '-' | '\u{2212}' => SyntaxKind::Minus,
884            '*' => SyntaxKind::Star,
885            '/' => SyntaxKind::Slash,
886            '=' => SyntaxKind::Eq,
887            '<' => SyntaxKind::Lt,
888            '>' => SyntaxKind::Gt,
889
890            c if is_id_start(c) => self.ident(start),
891
892            c => self.invalid_char_in_code(c),
893        }
894    }
895
896    /// Error for an invalid character in code, but try to give good hints for
897    /// commonly confusing operators.
898    fn invalid_char_in_code(&mut self, c: char) -> SyntaxKind {
899        let invalid_char = || eco_format!("the character `{c}` is not valid in code");
900        let invalid_str = |s: &str| eco_format!("`{s}` is not valid in code");
901        match c {
902            // Give a custom hint if we immediately follow a hash.
903            _ if self.s.scout(-2) == Some('#') => {
904                self.error(invalid_char());
905                // This is only an accurate hint if we just came from markup or
906                // math, but `#!` or `##` in code should be rare enough that
907                // it's fine (and the first hash will produce its own error).
908                self.hint("the preceding hash is causing this to parse in code mode");
909                self.hint("try escaping the preceding hash: `\\#`");
910                // The span for these hints isn't great, but it's hard to fix.
911            }
912            '#' => {
913                self.error(invalid_char());
914                self.hint("you are already in code mode");
915                self.hint("try removing the `#`");
916            }
917            '&' if self.s.eat_if('&') => {
918                self.error(invalid_str("&&"));
919                self.hint("in Typst, `and` is used for logical AND");
920            }
921            '|' if self.s.eat_if('|') => {
922                self.error(invalid_str("||"));
923                self.hint("in Typst, `or` is used for logical OR");
924            }
925            '!' => {
926                self.error(invalid_char());
927                self.hint("in Typst, `not` is used for negation");
928                self.hint("or did you mean to write `!=` for not-equal?");
929            }
930            '~' if self.s.eat_if('=') => {
931                self.error(invalid_str("~="));
932                self.hint("in Typst, `!=` is used for not-equal");
933            }
934            _ => {
935                self.error(invalid_char());
936            }
937        }
938        SyntaxKind::Error
939    }
940
941    fn ident(&mut self, start: usize) -> SyntaxKind {
942        self.s.eat_while(is_id_continue);
943        let ident = self.s.from(start);
944
945        let prev = self.s.get(0..start);
946        if (!prev.ends_with(['.', '@']) || prev.ends_with(".."))
947            && let Some(keyword) = keyword(ident)
948        {
949            return keyword;
950        }
951
952        if ident == "_" { SyntaxKind::Underscore } else { SyntaxKind::Ident }
953    }
954
955    /// Lex a single number, either an integer or a float, possibly with a
956    /// numeric suffix (`pt`, `deg`, `%`, etc.). Integers may also have a prefix
957    /// for binary, octal, or hexadecimal bases, but only base-10 integers can
958    /// have a numeric suffix (if so, they will be treated as floats in the
959    /// AST).
960    ///
961    /// Floating point numbers can use exponent notation with `e` or `E`, such
962    /// as `5.0e-3in` for `0.005in`.
963    fn number(&mut self, start: usize, first_c: char) -> SyntaxKind {
964        // Handle alternative integer bases.
965        let base = match first_c {
966            '0' if self.s.eat_if('b') => 2,
967            '0' if self.s.eat_if('o') => 8,
968            '0' if self.s.eat_if('x') => 16,
969            _ => 10,
970        };
971
972        // Read the initial digits.
973        if base == 16 {
974            self.s.eat_while(char::is_ascii_alphanumeric);
975        } else {
976            self.s.eat_while(char::is_ascii_digit);
977        }
978
979        // Read floating point digits and exponents.
980        let mut is_float = false;
981        if base == 10 {
982            // Read digits following a dot. Make sure not to confuse a spread
983            // operator or a method call for the decimal separator.
984            if first_c == '.' {
985                is_float = true; // We already ate the trailing digits above.
986            } else if !self.s.at("..")
987                && !self.s.scout(1).is_some_and(is_id_start)
988                && self.s.eat_if('.')
989            {
990                is_float = true;
991                self.s.eat_while(char::is_ascii_digit);
992            }
993
994            // Read the exponent.
995            if !self.s.at("em") && self.s.eat_if(['e', 'E']) {
996                is_float = true;
997                self.s.eat_if(['+', '-']);
998                self.s.eat_while(char::is_ascii_digit);
999            }
1000        }
1001
1002        let number = self.s.from(start);
1003        let suffix = self.s.eat_while(|c: char| c.is_ascii_alphanumeric() || c == '%');
1004
1005        // Parse large integer literals as floats
1006        if base == 10
1007            && !is_float
1008            && let Err(e) = i64::from_str_radix(number, base)
1009            && matches!(e.kind(), IntErrorKind::PosOverflow | IntErrorKind::NegOverflow)
1010            && number.parse::<f64>().is_ok()
1011        {
1012            is_float = true;
1013        }
1014
1015        let mut suffix_result = match suffix {
1016            "" => Ok(None),
1017            "pt" | "mm" | "cm" | "in" | "deg" | "rad" | "em" | "fr" | "%" => Ok(Some(())),
1018            _ => Err(eco_format!("invalid number suffix: `{suffix}`")),
1019        };
1020
1021        let number_result = if is_float && number.parse::<f64>().is_err() {
1022            // The only invalid case should be when a float lacks digits after
1023            // the exponent: e.g. `1.2e`, `2.3E-`, or `1EM`.
1024            Err(eco_format!("invalid floating point number: `{number}`"))
1025        } else if base == 10 {
1026            Ok(())
1027        } else {
1028            let name = match base {
1029                2 => "binary",
1030                8 => "octal",
1031                16 => "hexadecimal",
1032                _ => unreachable!(),
1033            };
1034            // The index `[2..]` skips the leading `0b`/`0o`/`0x`.
1035            match i64::from_str_radix(&number[2..], base) {
1036                Ok(_) if suffix.is_empty() => Ok(()),
1037                Ok(value) => {
1038                    if suffix_result.is_ok() {
1039                        suffix_result = Err(eco_format!(
1040                            "try using a decimal number: `{value}{suffix}`"
1041                        ));
1042                    }
1043                    Err(eco_format!("{name} numbers cannot have a suffix"))
1044                }
1045                Err(e) if *e.kind() == IntErrorKind::Empty => Err(eco_format!(
1046                    "expected a{} {name} number",
1047                    if base == 8 { "n" } else { "" },
1048                )),
1049                Err(_) => Err(eco_format!("invalid {name} number: `{number}`")),
1050            }
1051        };
1052
1053        // Return our number or write an error with helpful hints.
1054        match (number_result, suffix_result) {
1055            // Valid numbers :D
1056            (Ok(()), Ok(None)) if is_float => SyntaxKind::Float,
1057            (Ok(()), Ok(None)) => SyntaxKind::Int,
1058            (Ok(()), Ok(Some(()))) => SyntaxKind::Numeric,
1059            // Invalid numbers :(
1060            (Err(number_err), Err(suffix_err)) => {
1061                let error = self.error(number_err);
1062                self.hint(suffix_err);
1063                error
1064            }
1065            (Ok(()), Err(msg)) | (Err(msg), Ok(_)) => self.error(msg),
1066        }
1067    }
1068
1069    fn string(&mut self) -> SyntaxKind {
1070        let mut escaped = false;
1071        self.s.eat_until(|c| {
1072            let stop = c == '"' && !escaped;
1073            escaped = c == '\\' && !escaped;
1074            stop
1075        });
1076
1077        if !self.s.eat_if('"') {
1078            return self.error("unclosed string");
1079        }
1080
1081        SyntaxKind::Str
1082    }
1083}
1084
1085/// Try to parse an identifier into a keyword.
1086fn keyword(ident: &str) -> Option<SyntaxKind> {
1087    Some(match ident {
1088        "none" => SyntaxKind::None,
1089        "auto" => SyntaxKind::Auto,
1090        "true" => SyntaxKind::Bool,
1091        "false" => SyntaxKind::Bool,
1092        "not" => SyntaxKind::Not,
1093        "and" => SyntaxKind::And,
1094        "or" => SyntaxKind::Or,
1095        "let" => SyntaxKind::Let,
1096        "set" => SyntaxKind::Set,
1097        "show" => SyntaxKind::Show,
1098        "context" => SyntaxKind::Context,
1099        "if" => SyntaxKind::If,
1100        "else" => SyntaxKind::Else,
1101        "for" => SyntaxKind::For,
1102        "in" => SyntaxKind::In,
1103        "while" => SyntaxKind::While,
1104        "break" => SyntaxKind::Break,
1105        "continue" => SyntaxKind::Continue,
1106        "return" => SyntaxKind::Return,
1107        "import" => SyntaxKind::Import,
1108        "include" => SyntaxKind::Include,
1109        "as" => SyntaxKind::As,
1110        _ => return None,
1111    })
1112}
1113
/// Convenience methods added to the scanner used by the lexer.
trait ScannerExt {
    /// Move the cursor forward by `by`.
    fn advance(&mut self, by: usize);

    /// Try to consume a newline and return whether one was consumed.
    fn eat_newline(&mut self) -> bool;
}
1118
1119impl ScannerExt for Scanner<'_> {
1120    fn advance(&mut self, by: usize) {
1121        self.jump(self.cursor() + by);
1122    }
1123
1124    fn eat_newline(&mut self) -> bool {
1125        let ate = self.eat_if(is_newline);
1126        if ate && self.before().ends_with('\r') {
1127            self.eat_if('\n');
1128        }
1129        ate
1130    }
1131}
1132
1133/// Whether a character will become a [`SyntaxKind::Space`] token.
1134#[inline]
1135fn is_space(character: char, mode: SyntaxMode) -> bool {
1136    match mode {
1137        SyntaxMode::Markup => matches!(character, ' ' | '\t') || is_newline(character),
1138        _ => character.is_whitespace(),
1139    }
1140}
1141
/// Whether a character is interpreted as a newline by Typst.
#[inline]
pub fn is_newline(character: char) -> bool {
    match character {
        // Line Feed, Vertical Tab, Form Feed, Carriage Return (U+000A
        // through U+000D are contiguous).
        '\n'..='\r' => true,
        // Next Line.
        '\u{0085}' => true,
        // Line Separator, Paragraph Separator.
        '\u{2028}'..='\u{2029}' => true,
        _ => false,
    }
}
1153
/// Extracts a prefix of the text that is a link and also returns whether the
/// parentheses and brackets in the link were balanced.
///
/// The link extends over ASCII letters, digits, a fixed set of punctuation
/// and properly nested `[...]` / `(...)` pairs. It ends at the first other
/// character, or at a closing bracket that does not match the innermost
/// opening one. Trailing `!`, `,`, `.`, `:`, `;`, `?` and `'` are then
/// dropped, as they are likely to be part of the surrounding text.
///
/// The returned flag is `true` only if every bracket opened within the link
/// was also closed within it. A closing bracket that ends the scan does not
/// close anything: for `"(a]"` the result is `("(a", false)`.
pub fn link_prefix(text: &str) -> (&str, bool) {
    // Opening brackets that have not been closed yet, innermost last.
    let mut open = Vec::new();
    // Byte index just past the last accepted character.
    let mut end = 0;

    for (index, c) in text.char_indices() {
        #[rustfmt::skip]
        let accepted = match c {
            | '0' ..= '9'
            | 'a' ..= 'z'
            | 'A' ..= 'Z'
            | '!' | '#' | '$' | '%' | '&' | '*' | '+'
            | ',' | '-' | '.' | '/' | ':' | ';' | '='
            | '?' | '@' | '_' | '~' | '\'' => true,
            '[' | '(' => {
                open.push(c);
                true
            }
            ']' | ')' => {
                let expected = if c == ']' { '[' } else { '(' };
                // Only a matching closer pops the stack. A mismatched closer
                // ends the link and must leave the unclosed openers in place
                // so that they are reported as unbalanced.
                if open.last() == Some(&expected) {
                    open.pop();
                    true
                } else {
                    false
                }
            }
            _ => false,
        };

        if !accepted {
            break;
        }
        end = index + c.len_utf8();
    }

    // Don't include the trailing characters likely to be part of text.
    let link = text[..end].trim_end_matches(['!', ',', '.', ':', ';', '?', '\'']);

    (link, open.is_empty())
}
1190
1191/// Split text at newlines. These newline characters are not kept.
1192pub fn split_newlines(text: &str) -> Vec<&str> {
1193    let mut s = Scanner::new(text);
1194    let mut lines = Vec::new();
1195    let mut start = 0;
1196    let mut end = 0;
1197
1198    while let Some(c) = s.eat() {
1199        if is_newline(c) {
1200            if c == '\r' {
1201                s.eat_if('\n');
1202            }
1203
1204            lines.push(&text[start..end]);
1205            start = s.cursor();
1206        }
1207        end = s.cursor();
1208    }
1209
1210    lines.push(&text[start..]);
1211    lines
1212}
1213
1214/// Count the number of newlines in text.
1215fn count_newlines(text: &str) -> usize {
1216    let mut newlines = 0;
1217    let mut s = Scanner::new(text);
1218    while let Some(c) = s.eat() {
1219        if is_newline(c) {
1220            if c == '\r' {
1221                s.eat_if('\n');
1222            }
1223            newlines += 1;
1224        }
1225    }
1226    newlines
1227}
1228
1229/// Whether a string is a valid Typst identifier.
1230///
1231/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
1232/// - `_` as a starting character,
1233/// - `_` and `-` as continuing characters.
1234///
1235/// [uax31]: http://www.unicode.org/reports/tr31/
1236#[inline]
1237pub fn is_ident(string: &str) -> bool {
1238    let mut chars = string.chars();
1239    chars
1240        .next()
1241        .is_some_and(|c| is_id_start(c) && chars.all(is_id_continue))
1242}
1243
1244/// Whether a character can start an identifier.
1245#[inline]
1246pub fn is_id_start(c: char) -> bool {
1247    is_xid_start(c) || c == '_'
1248}
1249
1250/// Whether a character can continue an identifier.
1251#[inline]
1252pub fn is_id_continue(c: char) -> bool {
1253    is_xid_continue(c) || c == '_' || c == '-'
1254}
1255
/// Whether a character can start an identifier in math.
///
/// This accepts exactly the characters for which `is_xid_start` returns
/// true. Unlike [`is_id_start`], it makes no extra allowance for `_`.
#[inline]
fn is_math_id_start(c: char) -> bool {
    is_xid_start(c)
}
1261
1262/// Whether a character can continue an identifier in math.
1263#[inline]
1264fn is_math_id_continue(c: char) -> bool {
1265    is_xid_continue(c) && c != '_'
1266}
1267
1268/// Whether a character can be part of a label literal's name.
1269#[inline]
1270fn is_valid_in_label_literal(c: char) -> bool {
1271    is_id_continue(c) || matches!(c, ':' | '.')
1272}
1273
1274/// Returns true if this string is valid in a label literal.
1275pub fn is_valid_label_literal_id(id: &str) -> bool {
1276    !id.is_empty() && id.chars().all(is_valid_in_label_literal)
1277}