typst_syntax/
lexer.rs

1use std::num::IntErrorKind;
2
3use ecow::{EcoString, eco_format};
4use unicode_ident::{is_xid_continue, is_xid_start};
5use unicode_script::{Script, UnicodeScript};
6use unicode_segmentation::UnicodeSegmentation;
7use unscanny::Scanner;
8
9use crate::{SyntaxError, SyntaxKind, SyntaxMode, SyntaxNode};
10
/// An iterator over a source code string which returns tokens.
///
/// Besides the scanner and the active [`SyntaxMode`], the lexer carries two
/// pieces of per-token state: `newline` is reset at the start of every token
/// and `error` is set while lexing a token and taken again when the token's
/// node is built.
#[derive(Clone)]
pub(super) struct Lexer<'s> {
    /// The scanner: contains the underlying string and location as a "cursor".
    s: Scanner<'s>,
    /// The mode the lexer is in. This determines which kinds of tokens it
    /// produces.
    mode: SyntaxMode,
    /// Whether the last token contained a newline.
    newline: bool,
    /// An error for the last token.
    error: Option<SyntaxError>,
}
24
25impl<'s> Lexer<'s> {
26    /// Create a new lexer with the given mode and a prefix to offset column
27    /// calculations.
28    pub fn new(text: &'s str, mode: SyntaxMode) -> Self {
29        Self {
30            s: Scanner::new(text),
31            mode,
32            newline: false,
33            error: None,
34        }
35    }
36
37    /// Get the current lexing mode.
38    pub fn mode(&self) -> SyntaxMode {
39        self.mode
40    }
41
42    /// Change the lexing mode.
43    pub fn set_mode(&mut self, mode: SyntaxMode) {
44        self.mode = mode;
45    }
46
47    /// The index in the string at which the last token ends and next token
48    /// will start.
49    pub fn cursor(&self) -> usize {
50        self.s.cursor()
51    }
52
53    /// Jump to the given index in the string.
54    pub fn jump(&mut self, index: usize) {
55        self.s.jump(index);
56    }
57
58    /// Whether the last token contained a newline.
59    pub fn newline(&self) -> bool {
60        self.newline
61    }
62
63    /// The number of characters until the most recent newline from an index.
64    pub fn column(&self, index: usize) -> usize {
65        let mut s = self.s; // Make a new temporary scanner (cheap).
66        s.jump(index);
67        s.before().chars().rev().take_while(|&c| !is_newline(c)).count()
68    }
69}
70
71impl Lexer<'_> {
72    /// Construct a full-positioned syntax error.
73    fn error(&mut self, message: impl Into<EcoString>) -> SyntaxKind {
74        self.error = Some(SyntaxError::new(message));
75        SyntaxKind::Error
76    }
77
78    /// If the current node is an error, adds a hint.
79    fn hint(&mut self, message: impl Into<EcoString>) {
80        if let Some(error) = &mut self.error {
81            error.hints.push(message.into());
82        }
83    }
84}
85
/// Methods shared by all [`SyntaxMode`]s.
87impl Lexer<'_> {
    /// Return the next token in our text. Returns both the [`SyntaxNode`]
    /// and the raw [`SyntaxKind`] to make it more ergonomic to check the kind.
    ///
    /// Whitespace, comments and the shebang are handled here for every mode;
    /// everything else is dispatched to the lexing function of the current
    /// mode. At the end of the text, a token of kind [`SyntaxKind::End`] is
    /// returned.
    pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) {
        // Any error of the previous token must have been taken already.
        debug_assert!(self.error.is_none());
        let start = self.s.cursor();

        self.newline = false;
        // The guards below consume input via `eat_if`, so the order of the
        // arms matters.
        let kind = match self.s.eat() {
            Some(c) if is_space(c, self.mode) => self.whitespace(start, c),
            // A shebang is only recognized at the very start of the text.
            Some('#') if start == 0 && self.s.eat_if('!') => self.shebang(),
            Some('/') if self.s.eat_if('/') => self.line_comment(),
            Some('/') if self.s.eat_if('*') => self.block_comment(),
            // A `*/` without a matching `/*`.
            Some('*') if self.s.eat_if('/') => {
                let kind = self.error("unexpected end of block comment");
                self.hint(
                    "consider escaping the `*` with a backslash or \
                     opening the block comment with `/*`",
                );
                kind
            }
            // Raw text builds its own (inner) node, so return it directly.
            Some('`') if self.mode != SyntaxMode::Math => return self.raw(),
            Some(c) => match self.mode {
                SyntaxMode::Markup => self.markup(start, c),
                // Math lexing may hand back a fully built node, which is
                // returned as is.
                SyntaxMode::Math => match self.math(start, c) {
                    (kind, None) => kind,
                    (kind, Some(node)) => return (kind, node),
                },
                SyntaxMode::Code => self.code(start, c),
            },

            None => SyntaxKind::End,
        };

        // Build a leaf covering everything eaten since `start`, or an error
        // node if an error was recorded while lexing the token.
        let text = self.s.from(start);
        let node = match self.error.take() {
            Some(error) => SyntaxNode::error(error, text),
            None => SyntaxNode::leaf(kind, text),
        };
        (kind, node)
    }
128
129    /// Eat whitespace characters greedily.
130    fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
131        let more = self.s.eat_while(|c| is_space(c, self.mode));
132        let newlines = match c {
133            // Optimize eating a single space.
134            ' ' if more.is_empty() => 0,
135            _ => count_newlines(self.s.from(start)),
136        };
137
138        self.newline = newlines > 0;
139        if self.mode == SyntaxMode::Markup && newlines >= 2 {
140            SyntaxKind::Parbreak
141        } else {
142            SyntaxKind::Space
143        }
144    }
145
146    fn shebang(&mut self) -> SyntaxKind {
147        self.s.eat_until(is_newline);
148        SyntaxKind::Shebang
149    }
150
151    fn line_comment(&mut self) -> SyntaxKind {
152        self.s.eat_until(is_newline);
153        SyntaxKind::LineComment
154    }
155
156    fn block_comment(&mut self) -> SyntaxKind {
157        let mut state = '_';
158        let mut depth = 1;
159
160        // Find the first `*/` that does not correspond to a nested `/*`.
161        while let Some(c) = self.s.eat() {
162            state = match (state, c) {
163                ('*', '/') => {
164                    depth -= 1;
165                    if depth == 0 {
166                        break;
167                    }
168                    '_'
169                }
170                ('/', '*') => {
171                    depth += 1;
172                    '_'
173                }
174                _ => c,
175            }
176        }
177
178        SyntaxKind::BlockComment
179    }
180}
181
182/// Markup.
183impl Lexer<'_> {
    /// Lex a single markup token. The token's first character `c` has already
    /// been eaten and `start` is the index at which it began.
    ///
    /// The guards consume input, so the order of the arms matters: longer
    /// shorthands are tried before their prefixes.
    fn markup(&mut self, start: usize, c: char) -> SyntaxKind {
        match c {
            '\\' => self.backslash(),
            // Automatic links.
            'h' if self.s.eat_if("ttp://") => self.link(),
            'h' if self.s.eat_if("ttps://") => self.link(),
            '<' if self.s.at(is_id_continue) => self.label(),
            '@' if self.s.at(is_id_continue) => self.ref_marker(),

            // Shorthands made of several characters (the first one is `c`).
            '.' if self.s.eat_if("..") => SyntaxKind::Shorthand,
            '-' if self.s.eat_if("--") => SyntaxKind::Shorthand,
            '-' if self.s.eat_if('-') => SyntaxKind::Shorthand,
            '-' if self.s.eat_if('?') => SyntaxKind::Shorthand,
            // A hyphen directly before a numeric character; the character
            // itself is not consumed.
            '-' if self.s.at(char::is_numeric) => SyntaxKind::Shorthand,
            // `*` and `_` between two word characters stay plain text.
            '*' if !self.in_word() => SyntaxKind::Star,
            '_' if !self.in_word() => SyntaxKind::Underscore,

            '#' => SyntaxKind::Hash,
            '[' => SyntaxKind::LeftBracket,
            ']' => SyntaxKind::RightBracket,
            '\'' => SyntaxKind::SmartQuote,
            '"' => SyntaxKind::SmartQuote,
            '$' => SyntaxKind::Dollar,
            '~' => SyntaxKind::Shorthand,
            ':' => SyntaxKind::Colon,
            // A run of `=` is a heading marker only if followed by a space,
            // a comment, or the end of the text.
            '=' => {
                self.s.eat_while('=');
                if self.space_or_end() { SyntaxKind::HeadingMarker } else { self.text() }
            }
            '-' if self.space_or_end() => SyntaxKind::ListMarker,
            '+' if self.space_or_end() => SyntaxKind::EnumMarker,
            '/' if self.space_or_end() => SyntaxKind::TermMarker,
            '0'..='9' => self.numbering(start),

            _ => self.text(),
        }
    }
220
221    fn backslash(&mut self) -> SyntaxKind {
222        if self.s.eat_if("u{") {
223            let hex = self.s.eat_while(char::is_ascii_alphanumeric);
224            if !self.s.eat_if('}') {
225                return self.error("unclosed Unicode escape sequence");
226            }
227
228            if u32::from_str_radix(hex, 16)
229                .ok()
230                .and_then(std::char::from_u32)
231                .is_none()
232            {
233                return self.error(eco_format!("invalid Unicode codepoint: {}", hex));
234            }
235
236            return SyntaxKind::Escape;
237        }
238
239        if self.s.done() || self.s.at(char::is_whitespace) {
240            SyntaxKind::Linebreak
241        } else {
242            self.s.eat();
243            SyntaxKind::Escape
244        }
245    }
246
    /// We parse entire raw segments in the lexer as a convenience to avoid
    /// going to and from the parser for each raw section. See comments in
    /// [`Self::blocky_raw`] and [`Self::inline_raw`] for specific details.
    ///
    /// Called after the first backtick has been eaten. Returns a `Raw` inner
    /// node whose children are the delimiters and the pieces in between, or
    /// an error node covering the rest of the text if the raw text is never
    /// closed.
    fn raw(&mut self) -> (SyntaxKind, SyntaxNode) {
        // The first backtick was already eaten by the caller.
        let start = self.s.cursor() - 1;

        // Determine number of opening backticks.
        let mut backticks = 1;
        while self.s.eat_if('`') {
            backticks += 1;
        }

        // Special case for ``.
        if backticks == 2 {
            let nodes = vec![
                SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
                SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
            ];
            return (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes));
        }

        // Find end of raw text.
        // `found` counts consecutive backticks; any other character resets it.
        let mut found = 0;
        while found < backticks {
            match self.s.eat() {
                Some('`') => found += 1,
                Some(_) => found = 0,
                None => {
                    let msg = SyntaxError::new("unclosed raw text");
                    let error = SyntaxNode::error(msg, self.s.from(start));
                    return (SyntaxKind::Error, error);
                }
            }
        }
        let end = self.s.cursor();

        let mut nodes = Vec::with_capacity(3); // Will have at least 3.

        // A closure for pushing a node onto our raw vector. Assumes the caller
        // will move the scanner to the next location at each step.
        let mut prev_start = start;
        let mut push_raw = |kind, s: &Scanner| {
            nodes.push(SyntaxNode::leaf(kind, s.from(prev_start)));
            prev_start = s.cursor();
        };

        // Opening delimiter.
        // Rewind to just after the opening backticks; the inner text is then
        // walked a second time to split it into nodes.
        self.s.jump(start + backticks);
        push_raw(SyntaxKind::RawDelim, &self.s);

        // `end - backticks` is the index at which the closing delimiter
        // starts, i.e. the end of the inner text.
        if backticks >= 3 {
            self.blocky_raw(end - backticks, &mut push_raw);
        } else {
            self.inline_raw(end - backticks, &mut push_raw);
        }

        // Closing delimiter.
        self.s.jump(end);
        push_raw(SyntaxKind::RawDelim, &self.s);

        (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes))
    }
309
    /// Raw blocks parse a language tag, have smart behavior for trimming
    /// whitespace in the start/end lines, and trim common leading whitespace
    /// from all other lines as the "dedent". The exact behavior is described
    /// below.
    ///
    /// ### The initial line:
    /// - A valid Typst identifier immediately following the opening delimiter
    ///   is parsed as the language tag.
    /// - We check the rest of the line and if all characters are whitespace,
    ///   trim it. Otherwise we trim a single leading space if present.
    ///   - If more trimmed characters follow on future lines, they will be
    ///     merged into the same trimmed element.
    /// - If we didn't trim the entire line, the rest is kept as text.
    ///
    /// ### Inner lines:
    /// - We determine the "dedent" by iterating over the lines. The dedent is
    ///   the minimum number of leading whitespace characters (not bytes) before
    ///   each line that has any non-whitespace characters.
    ///   - The opening delimiter's line does not contribute to the dedent, but
    ///     the closing delimiter's line does (even if that line is entirely
    ///     whitespace up to the delimiter).
    /// - We then trim the newline and dedent characters of each line, and add a
    ///   (potentially empty) text element of all remaining characters.
    ///
    /// ### The final line:
    /// - If the last line is entirely whitespace, it is trimmed.
    /// - Otherwise its text is kept like an inner line. However, if the last
    ///   non-whitespace character of the final line is a backtick, then one
    ///   ascii space (if present) is trimmed from the end.
    ///
    /// `inner_end` is the index at which the closing delimiter starts, and
    /// `push_raw` is called with the scanner positioned at the end of each
    /// piece to emit.
    fn blocky_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
    where
        F: FnMut(SyntaxKind, &Scanner),
    {
        // Language tag.
        if self.s.eat_if(is_id_start) {
            self.s.eat_while(is_id_continue);
            push_raw(SyntaxKind::RawLang, &self.s);
        }

        // The rest of the function operates on the lines between the backticks.
        let mut lines = split_newlines(self.s.to(inner_end));

        // Determine dedent level.
        let dedent = lines
            .iter()
            .skip(1)
            .filter(|line| !line.chars().all(char::is_whitespace))
            // The line with the closing ``` is always taken into account
            .chain(lines.last())
            .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
            .min()
            .unwrap_or(0);

        // Trim whitespace from the last line. Will be added as a `RawTrimmed`
        // kind by the check for `self.s.cursor() < inner_end` below.
        if lines.last().is_some_and(|last| last.chars().all(char::is_whitespace)) {
            lines.pop();
        } else if let Some(last) = lines.last_mut() {
            // If last line ends in a backtick, try to trim a single space. This
            // check must happen before we add the first line since the last and
            // first lines might be the same.
            if last.trim_end().ends_with('`') {
                *last = last.strip_suffix(' ').unwrap_or(last);
            }
        }

        let mut lines = lines.into_iter();

        // Handle the first line: trim if all whitespace, or trim a single space
        // at the start. Note that the first line does not affect the dedent
        // value.
        if let Some(first_line) = lines.next() {
            if first_line.chars().all(char::is_whitespace) {
                self.s.advance(first_line.len());
                // This is the only spot we advance the scanner, but don't
                // immediately call `push_raw`. But the rest of the function
                // ensures we will always add this text to a `RawTrimmed` later.
                debug_assert!(self.s.cursor() != inner_end);
                // A proof by cases follows:
                // # First case: The loop runs
                // If the loop runs, there must be a newline following, so
                // `cursor != inner_end`. And if the loop runs, the first thing
                // it does is add a trimmed element.
                // # Second case: The final if-statement runs.
                // To _not_ reach the loop from here, we must have only one or
                // two lines:
                // 1. If one line, we cannot be here, because the first and last
                //    lines are the same, so this line will have been removed by
                //    the check for the last line being all whitespace.
                // 2. If two lines, the loop will run unless the last is fully
                //    whitespace, but if it is, it will have been popped, then
                //    the final if-statement will run because the text removed
                //    by the last line must include at least a newline, so
                //    `cursor != inner_end` here.
            } else {
                let line_end = self.s.cursor() + first_line.len();
                if self.s.eat_if(' ') {
                    // Trim a single space after the lang tag on the first line.
                    push_raw(SyntaxKind::RawTrimmed, &self.s);
                }
                // We know here that the rest of the line is non-empty.
                self.s.jump(line_end);
                push_raw(SyntaxKind::Text, &self.s);
            }
        }

        // Add lines.
        for line in lines {
            // Byte length of the (at most `dedent`) leading characters that
            // are trimmed from this line.
            let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
            self.s.eat_newline();
            self.s.advance(offset);
            push_raw(SyntaxKind::RawTrimmed, &self.s);
            self.s.advance(line.len() - offset);
            push_raw(SyntaxKind::Text, &self.s);
        }

        // Add final trimmed.
        if self.s.cursor() < inner_end {
            self.s.jump(inner_end);
            push_raw(SyntaxKind::RawTrimmed, &self.s);
        }
    }
432
433    /// Inline raw text is split on lines with non-newlines as `Text` kinds and
434    /// newlines as `RawTrimmed`. Inline raw text does not dedent the text, all
435    /// non-newline whitespace is kept.
436    fn inline_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
437    where
438        F: FnMut(SyntaxKind, &Scanner),
439    {
440        while self.s.cursor() < inner_end {
441            if self.s.at(is_newline) {
442                push_raw(SyntaxKind::Text, &self.s);
443                self.s.eat_newline();
444                push_raw(SyntaxKind::RawTrimmed, &self.s);
445                continue;
446            }
447            self.s.eat();
448        }
449        push_raw(SyntaxKind::Text, &self.s);
450    }
451
452    fn link(&mut self) -> SyntaxKind {
453        let (link, balanced) = link_prefix(self.s.after());
454        self.s.advance(link.len());
455
456        if !balanced {
457            return self.error(
458                "automatic links cannot contain unbalanced brackets, \
459                 use the `link` function instead",
460            );
461        }
462
463        SyntaxKind::Link
464    }
465
466    fn numbering(&mut self, start: usize) -> SyntaxKind {
467        self.s.eat_while(char::is_ascii_digit);
468
469        let read = self.s.from(start);
470        if self.s.eat_if('.') && self.space_or_end() && read.parse::<u64>().is_ok() {
471            return SyntaxKind::EnumMarker;
472        }
473
474        self.text()
475    }
476
477    fn ref_marker(&mut self) -> SyntaxKind {
478        self.s.eat_while(is_valid_in_label_literal);
479
480        // Don't include the trailing characters likely to be part of text.
481        while matches!(self.s.scout(-1), Some('.' | ':')) {
482            self.s.uneat();
483        }
484
485        SyntaxKind::RefMarker
486    }
487
488    fn label(&mut self) -> SyntaxKind {
489        let label = self.s.eat_while(is_valid_in_label_literal);
490        if label.is_empty() {
491            return self.error("label cannot be empty");
492        }
493
494        if !self.s.eat_if('>') {
495            return self.error("unclosed label");
496        }
497
498        SyntaxKind::Label
499    }
500
    /// Eat a run of plain markup text.
    ///
    /// Text is eaten up to the next character that might start another kind
    /// of token. If a lookahead shows that this character would end up as
    /// text anyway, it is added to the run and eating continues.
    fn text(&mut self) -> SyntaxKind {
        // Builds a lookup table over the ASCII range which is `true` for the
        // listed characters.
        macro_rules! table {
            ($(|$c:literal)*) => {
                static TABLE: [bool; 128] = {
                    let mut t = [false; 128];
                    $(t[$c as usize] = true;)*
                    t
                };
            };
        }

        // The ASCII characters at which a run of text is interrupted.
        table! {
            | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
            | '[' | ']' | '~' | '-' | '.' | '\'' | '"' | '*' | '_'
            | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
        };

        loop {
            // Characters outside the table (i.e. non-ASCII ones) interrupt the
            // text only if they are whitespace.
            self.s.eat_until(|c: char| {
                TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
            });

            // Continue with the same text node if the thing would become text
            // anyway.
            // The lookahead happens on a copy of the scanner, which is only
            // committed if one of the arms below matches.
            let mut s = self.s;
            match s.eat() {
                Some(' ') if s.at(char::is_alphanumeric) => {}
                Some('/') if !s.at(['/', '*']) => {}
                Some('-') if !s.at(['-', '?']) => {}
                Some('.') if !s.at("..") => {}
                Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
                Some('@') if !s.at(is_valid_in_label_literal) => {}
                _ => break,
            }

            self.s = s;
        }

        SyntaxKind::Text
    }
541
542    fn in_word(&self) -> bool {
543        let wordy = |c: Option<char>| {
544            c.is_some_and(|c| {
545                c.is_alphanumeric()
546                    && !matches!(
547                        c.script(),
548                        Script::Han
549                            | Script::Hiragana
550                            | Script::Katakana
551                            | Script::Hangul
552                    )
553            })
554        };
555        let prev = self.s.scout(-2);
556        let next = self.s.peek();
557        wordy(prev) && wordy(next)
558    }
559
560    fn space_or_end(&self) -> bool {
561        self.s.done()
562            || self.s.at(char::is_whitespace)
563            || self.s.at("//")
564            || self.s.at("/*")
565    }
566}
567
568/// Math.
569impl Lexer<'_> {
    /// Lex a single math token. The token's first character `c` has already
    /// been eaten and `start` is the index at which it began.
    ///
    /// Returns the token's kind and, for identifiers and field accesses, an
    /// already constructed node. If the node is `None`, the caller is expected
    /// to build the node from the eaten text.
    ///
    /// The guards consume input, so the order of the arms matters: where one
    /// shorthand is a prefix of another, the longer one is tried first.
    fn math(&mut self, start: usize, c: char) -> (SyntaxKind, Option<SyntaxNode>) {
        let kind = match c {
            '\\' => self.backslash(),
            '"' => self.string(),

            // Multi-character shorthands (the first character is `c`).
            '-' if self.s.eat_if(">>") => SyntaxKind::MathShorthand,
            '-' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            '-' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            ':' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            ':' if self.s.eat_if(":=") => SyntaxKind::MathShorthand,
            '!' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            '.' if self.s.eat_if("..") => SyntaxKind::MathShorthand,
            '[' if self.s.eat_if('|') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("==>") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("-->") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("--") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("-<") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("<-") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("<<") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("==") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("~~") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('<') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('-') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('~') => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if(">>") => SyntaxKind::MathShorthand,
            '=' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
            '=' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            '=' if self.s.eat_if(':') => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if(']') => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if('|') => SyntaxKind::MathShorthand,
            '~' if self.s.eat_if("~>") => SyntaxKind::MathShorthand,
            '~' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            // Single-character shorthands.
            '*' | '-' | '~' => SyntaxKind::MathShorthand,

            '.' => SyntaxKind::Dot,
            ',' => SyntaxKind::Comma,
            ';' => SyntaxKind::Semicolon,
            ')' => SyntaxKind::RightParen,

            '#' => SyntaxKind::Hash,
            '_' => SyntaxKind::Underscore,
            '$' => SyntaxKind::Dollar,
            '/' => SyntaxKind::Slash,
            '^' => SyntaxKind::Hat,
            '\'' => SyntaxKind::Prime,
            '&' => SyntaxKind::MathAlignPoint,
            '√' | '∛' | '∜' => SyntaxKind::Root,

            // Identifiers.
            // Requires at least two identifier characters; a single one falls
            // through to `math_text` below.
            c if is_math_id_start(c) && self.s.at(is_math_id_continue) => {
                self.s.eat_while(is_math_id_continue);
                let (kind, node) = self.math_ident_or_field(start);
                return (kind, Some(node));
            }

            // Other math atoms.
            _ => self.math_text(start, c),
        };
        (kind, None)
    }
638
639    /// Parse a single `MathIdent` or an entire `FieldAccess`.
640    fn math_ident_or_field(&mut self, start: usize) -> (SyntaxKind, SyntaxNode) {
641        let mut kind = SyntaxKind::MathIdent;
642        let mut node = SyntaxNode::leaf(kind, self.s.from(start));
643        while let Some(ident) = self.maybe_dot_ident() {
644            kind = SyntaxKind::FieldAccess;
645            let field_children = vec![
646                node,
647                SyntaxNode::leaf(SyntaxKind::Dot, '.'),
648                SyntaxNode::leaf(SyntaxKind::Ident, ident),
649            ];
650            node = SyntaxNode::inner(kind, field_children);
651        }
652        (kind, node)
653    }
654
655    /// If at a dot and a math identifier, eat and return the identifier.
656    fn maybe_dot_ident(&mut self) -> Option<&str> {
657        if self.s.scout(1).is_some_and(is_math_id_start) && self.s.eat_if('.') {
658            let ident_start = self.s.cursor();
659            self.s.eat();
660            self.s.eat_while(is_math_id_continue);
661            Some(self.s.from(ident_start))
662        } else {
663            None
664        }
665    }
666
667    fn math_text(&mut self, start: usize, c: char) -> SyntaxKind {
668        // Keep numbers and grapheme clusters together.
669        if c.is_numeric() {
670            self.s.eat_while(char::is_numeric);
671            let mut s = self.s;
672            if s.eat_if('.') && !s.eat_while(char::is_numeric).is_empty() {
673                self.s = s;
674            }
675            SyntaxKind::MathText
676        } else {
677            let len = self
678                .s
679                .get(start..self.s.string().len())
680                .graphemes(true)
681                .next()
682                .map_or(0, str::len);
683            self.s.jump(start + len);
684            if len > c.len_utf8() {
685                // Grapheme clusters are treated as normal text and stay grouped
686                // This may need to change in the future.
687                SyntaxKind::Text
688            } else {
689                SyntaxKind::MathText
690            }
691        }
692    }
693
694    /// Handle named arguments in math function call.
695    pub fn maybe_math_named_arg(&mut self, start: usize) -> Option<SyntaxNode> {
696        let cursor = self.s.cursor();
697        self.s.jump(start);
698        if self.s.eat_if(is_id_start) {
699            self.s.eat_while(is_id_continue);
700            // Check that a colon directly follows the identifier, and not the
701            // `:=` or `::=` math shorthands.
702            if self.s.at(':') && !self.s.at(":=") && !self.s.at("::=") {
703                // Check that the identifier is not just `_`.
704                let node = if self.s.from(start) != "_" {
705                    SyntaxNode::leaf(SyntaxKind::Ident, self.s.from(start))
706                } else {
707                    let msg = SyntaxError::new("expected identifier, found underscore");
708                    SyntaxNode::error(msg, self.s.from(start))
709                };
710                return Some(node);
711            }
712        }
713        self.s.jump(cursor);
714        None
715    }
716
717    /// Handle spread arguments in math function call.
718    pub fn maybe_math_spread_arg(&mut self, start: usize) -> Option<SyntaxNode> {
719        let cursor = self.s.cursor();
720        self.s.jump(start);
721        if self.s.eat_if("..") {
722            // Check that neither a space nor a dot follows the spread syntax.
723            // A dot would clash with the `...` math shorthand.
724            if !self.space_or_end() && !self.s.at('.') {
725                let node = SyntaxNode::leaf(SyntaxKind::Dots, self.s.from(start));
726                return Some(node);
727            }
728        }
729        self.s.jump(cursor);
730        None
731    }
732}
733
734/// Code.
735impl Lexer<'_> {
    /// Lex a single code token. The token's first character `c` has already
    /// been eaten and `start` is the index at which it began.
    ///
    /// The guards consume input, so the order of the arms matters: the
    /// two-character operators are tried before the single-character ones.
    /// A character that cannot start any code token produces an error.
    fn code(&mut self, start: usize, c: char) -> SyntaxKind {
        match c {
            '<' if self.s.at(is_id_continue) => self.label(),
            '0'..='9' => self.number(start, c),
            // A number may also start with its decimal point.
            '.' if self.s.at(char::is_ascii_digit) => self.number(start, c),
            '"' => self.string(),

            '=' if self.s.eat_if('=') => SyntaxKind::EqEq,
            '!' if self.s.eat_if('=') => SyntaxKind::ExclEq,
            '<' if self.s.eat_if('=') => SyntaxKind::LtEq,
            '>' if self.s.eat_if('=') => SyntaxKind::GtEq,
            '+' if self.s.eat_if('=') => SyntaxKind::PlusEq,
            // U+2212 (the Unicode minus sign) is accepted like `-`.
            '-' | '\u{2212}' if self.s.eat_if('=') => SyntaxKind::HyphEq,
            '*' if self.s.eat_if('=') => SyntaxKind::StarEq,
            '/' if self.s.eat_if('=') => SyntaxKind::SlashEq,
            '.' if self.s.eat_if('.') => SyntaxKind::Dots,
            '=' if self.s.eat_if('>') => SyntaxKind::Arrow,

            '{' => SyntaxKind::LeftBrace,
            '}' => SyntaxKind::RightBrace,
            '[' => SyntaxKind::LeftBracket,
            ']' => SyntaxKind::RightBracket,
            '(' => SyntaxKind::LeftParen,
            ')' => SyntaxKind::RightParen,
            '$' => SyntaxKind::Dollar,
            ',' => SyntaxKind::Comma,
            ';' => SyntaxKind::Semicolon,
            ':' => SyntaxKind::Colon,
            '.' => SyntaxKind::Dot,
            '+' => SyntaxKind::Plus,
            '-' | '\u{2212}' => SyntaxKind::Minus,
            '*' => SyntaxKind::Star,
            '/' => SyntaxKind::Slash,
            '=' => SyntaxKind::Eq,
            '<' => SyntaxKind::Lt,
            '>' => SyntaxKind::Gt,

            c if is_id_start(c) => self.ident(start),

            c => self.error(eco_format!("the character `{c}` is not valid in code")),
        }
    }
778
779    fn ident(&mut self, start: usize) -> SyntaxKind {
780        self.s.eat_while(is_id_continue);
781        let ident = self.s.from(start);
782
783        let prev = self.s.get(0..start);
784        if (!prev.ends_with(['.', '@']) || prev.ends_with(".."))
785            && let Some(keyword) = keyword(ident)
786        {
787            return keyword;
788        }
789
790        if ident == "_" { SyntaxKind::Underscore } else { SyntaxKind::Ident }
791    }
792
    /// Lex a number literal that begins at index `start`.
    ///
    /// `first_c` is the literal's first character; the scanner is expected to
    /// be positioned just after it. Yields `SyntaxKind::Int`,
    /// `SyntaxKind::Float`, or `SyntaxKind::Numeric` (a number followed by a
    /// recognized unit suffix). Malformed numbers or suffixes yield whatever
    /// `self.error` returns for the corresponding message.
    fn number(&mut self, start: usize, first_c: char) -> SyntaxKind {
        // Handle alternative integer bases.
        let base = match first_c {
            '0' if self.s.eat_if('b') => 2,
            '0' if self.s.eat_if('o') => 8,
            '0' if self.s.eat_if('x') => 16,
            _ => 10,
        };

        // Read the initial digits.
        if base == 16 {
            // All alphanumerics are consumed here, so letters that are not
            // hex digits end up in the number text and are rejected by the
            // radix parse further down.
            self.s.eat_while(char::is_ascii_alphanumeric);
        } else {
            self.s.eat_while(char::is_ascii_digit);
        }

        // Read floating point digits and exponents.
        let mut is_float = false;
        if base == 10 {
            // Read digits following a dot. Make sure not to confuse a spread
            // operator or a method call for the decimal separator.
            if first_c == '.' {
                is_float = true; // We already ate the trailing digits above.
            } else if !self.s.at("..")
                && !self.s.scout(1).is_some_and(is_id_start)
                && self.s.eat_if('.')
            {
                is_float = true;
                self.s.eat_while(char::is_ascii_digit);
            }

            // Read the exponent.
            // A following `em` is left alone so it can be read as a suffix
            // below rather than as the start of an exponent.
            if !self.s.at("em") && self.s.eat_if(['e', 'E']) {
                is_float = true;
                self.s.eat_if(['+', '-']);
                self.s.eat_while(char::is_ascii_digit);
            }
        }

        // Everything eaten so far is the number; any directly following
        // alphanumerics or `%` form the suffix.
        let number = self.s.from(start);
        let suffix = self.s.eat_while(|c: char| c.is_ascii_alphanumeric() || c == '%');

        // Parse large integer literals as floats
        if base == 10
            && !is_float
            && let Err(e) = i64::from_str_radix(number, base)
            && matches!(e.kind(), IntErrorKind::PosOverflow | IntErrorKind::NegOverflow)
            && number.parse::<f64>().is_ok()
        {
            is_float = true;
        }

        // `Ok(None)`: no suffix. `Ok(Some(()))`: a recognized suffix.
        // `Err(..)`: an unrecognized suffix.
        let mut suffix_result = match suffix {
            "" => Ok(None),
            "pt" | "mm" | "cm" | "in" | "deg" | "rad" | "em" | "fr" | "%" => Ok(Some(())),
            _ => Err(eco_format!("invalid number suffix: {suffix}")),
        };

        let number_result = if is_float && number.parse::<f64>().is_err() {
            // The only invalid case should be when a float lacks digits after
            // the exponent: e.g. `1.2e`, `2.3E-`, or `1EM`.
            Err(eco_format!("invalid floating point number: {number}"))
        } else if base == 10 {
            Ok(())
        } else {
            let name = match base {
                2 => "binary",
                8 => "octal",
                16 => "hexadecimal",
                _ => unreachable!(),
            };
            // The index `[2..]` skips the leading `0b`/`0o`/`0x`.
            match i64::from_str_radix(&number[2..], base) {
                Ok(_) if suffix.is_empty() => Ok(()),
                Ok(value) => {
                    // The digits are valid but a suffix is present. If the
                    // suffix itself is a recognized one, replace its result
                    // with a message suggesting the decimal spelling.
                    if suffix_result.is_ok() {
                        suffix_result = Err(eco_format!(
                            "try using a decimal number: {value}{suffix}"
                        ));
                    }
                    Err(eco_format!("{name} numbers cannot have a suffix"))
                }
                Err(_) => Err(eco_format!("invalid {name} number: {number}")),
            }
        };

        // Return our number or write an error with helpful hints.
        match (number_result, suffix_result) {
            // Valid numbers :D
            (Ok(()), Ok(None)) if is_float => SyntaxKind::Float,
            (Ok(()), Ok(None)) => SyntaxKind::Int,
            (Ok(()), Ok(Some(()))) => SyntaxKind::Numeric,
            // Invalid numbers :(
            // When both parts failed, the number message is passed to
            // `self.error` and the suffix message to `self.hint`.
            (Err(number_err), Err(suffix_err)) => {
                let err = self.error(number_err);
                self.hint(suffix_err);
                err
            }
            (Ok(()), Err(msg)) | (Err(msg), Ok(_)) => self.error(msg),
        }
    }
894
895    fn string(&mut self) -> SyntaxKind {
896        let mut escaped = false;
897        self.s.eat_until(|c| {
898            let stop = c == '"' && !escaped;
899            escaped = c == '\\' && !escaped;
900            stop
901        });
902
903        if !self.s.eat_if('"') {
904            return self.error("unclosed string");
905        }
906
907        SyntaxKind::Str
908    }
909}
910
911/// Try to parse an identifier into a keyword.
912fn keyword(ident: &str) -> Option<SyntaxKind> {
913    Some(match ident {
914        "none" => SyntaxKind::None,
915        "auto" => SyntaxKind::Auto,
916        "true" => SyntaxKind::Bool,
917        "false" => SyntaxKind::Bool,
918        "not" => SyntaxKind::Not,
919        "and" => SyntaxKind::And,
920        "or" => SyntaxKind::Or,
921        "let" => SyntaxKind::Let,
922        "set" => SyntaxKind::Set,
923        "show" => SyntaxKind::Show,
924        "context" => SyntaxKind::Context,
925        "if" => SyntaxKind::If,
926        "else" => SyntaxKind::Else,
927        "for" => SyntaxKind::For,
928        "in" => SyntaxKind::In,
929        "while" => SyntaxKind::While,
930        "break" => SyntaxKind::Break,
931        "continue" => SyntaxKind::Continue,
932        "return" => SyntaxKind::Return,
933        "import" => SyntaxKind::Import,
934        "include" => SyntaxKind::Include,
935        "as" => SyntaxKind::As,
936        _ => return None,
937    })
938}
939
/// Extra convenience methods implemented in this file for [`Scanner`].
trait ScannerExt {
    /// Move the cursor forward by `by`, relative to its current index.
    fn advance(&mut self, by: usize);
    /// Consume a single newline if the scanner is at one, counting `\r\n` as
    /// one newline. Returns whether a newline was consumed.
    fn eat_newline(&mut self) -> bool;
}
944
945impl ScannerExt for Scanner<'_> {
946    fn advance(&mut self, by: usize) {
947        self.jump(self.cursor() + by);
948    }
949
950    fn eat_newline(&mut self) -> bool {
951        let ate = self.eat_if(is_newline);
952        if ate && self.before().ends_with('\r') {
953            self.eat_if('\n');
954        }
955        ate
956    }
957}
958
959/// Whether a character will become a [`SyntaxKind::Space`] token.
960#[inline]
961fn is_space(character: char, mode: SyntaxMode) -> bool {
962    match mode {
963        SyntaxMode::Markup => matches!(character, ' ' | '\t') || is_newline(character),
964        _ => character.is_whitespace(),
965    }
966}
967
/// Whether Typst treats the character as a newline.
#[inline]
pub fn is_newline(character: char) -> bool {
    matches!(
        character,
        // U+000A through U+000D: Line Feed, Vertical Tab, Form Feed and
        // Carriage Return are contiguous.
        '\n'..='\r'
        // Next Line.
        | '\u{0085}'
        // Line Separator and Paragraph Separator.
        | '\u{2028}'..='\u{2029}'
    )
}
979
980/// Extracts a prefix of the text that is a link and also returns whether the
981/// parentheses and brackets in the link were balanced.
982pub fn link_prefix(text: &str) -> (&str, bool) {
983    let mut s = unscanny::Scanner::new(text);
984    let mut brackets = Vec::new();
985
986    #[rustfmt::skip]
987    s.eat_while(|c: char| {
988        match c {
989            | '0' ..= '9'
990            | 'a' ..= 'z'
991            | 'A' ..= 'Z'
992            | '!' | '#' | '$' | '%' | '&' | '*' | '+'
993            | ',' | '-' | '.' | '/' | ':' | ';' | '='
994            | '?' | '@' | '_' | '~' | '\'' => true,
995            '[' => {
996                brackets.push(b'[');
997                true
998            }
999            '(' => {
1000                brackets.push(b'(');
1001                true
1002            }
1003            ']' => brackets.pop() == Some(b'['),
1004            ')' => brackets.pop() == Some(b'('),
1005            _ => false,
1006        }
1007    });
1008
1009    // Don't include the trailing characters likely to be part of text.
1010    while matches!(s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) {
1011        s.uneat();
1012    }
1013
1014    (s.before(), brackets.is_empty())
1015}
1016
1017/// Split text at newlines. These newline characters are not kept.
1018pub fn split_newlines(text: &str) -> Vec<&str> {
1019    let mut s = Scanner::new(text);
1020    let mut lines = Vec::new();
1021    let mut start = 0;
1022    let mut end = 0;
1023
1024    while let Some(c) = s.eat() {
1025        if is_newline(c) {
1026            if c == '\r' {
1027                s.eat_if('\n');
1028            }
1029
1030            lines.push(&text[start..end]);
1031            start = s.cursor();
1032        }
1033        end = s.cursor();
1034    }
1035
1036    lines.push(&text[start..]);
1037    lines
1038}
1039
1040/// Count the number of newlines in text.
1041fn count_newlines(text: &str) -> usize {
1042    let mut newlines = 0;
1043    let mut s = Scanner::new(text);
1044    while let Some(c) = s.eat() {
1045        if is_newline(c) {
1046            if c == '\r' {
1047                s.eat_if('\n');
1048            }
1049            newlines += 1;
1050        }
1051    }
1052    newlines
1053}
1054
1055/// Whether a string is a valid Typst identifier.
1056///
1057/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
1058/// - `_` as a starting character,
1059/// - `_` and `-` as continuing characters.
1060///
1061/// [uax31]: http://www.unicode.org/reports/tr31/
1062#[inline]
1063pub fn is_ident(string: &str) -> bool {
1064    let mut chars = string.chars();
1065    chars
1066        .next()
1067        .is_some_and(|c| is_id_start(c) && chars.all(is_id_continue))
1068}
1069
1070/// Whether a character can start an identifier.
1071#[inline]
1072pub fn is_id_start(c: char) -> bool {
1073    is_xid_start(c) || c == '_'
1074}
1075
1076/// Whether a character can continue an identifier.
1077#[inline]
1078pub fn is_id_continue(c: char) -> bool {
1079    is_xid_continue(c) || c == '_' || c == '-'
1080}
1081
/// Whether a character can start an identifier in math.
///
/// Unlike [`is_id_start`], this does not accept `_`; only `XID_Start`
/// characters qualify.
#[inline]
fn is_math_id_start(c: char) -> bool {
    is_xid_start(c)
}
1087
1088/// Whether a character can continue an identifier in math.
1089#[inline]
1090fn is_math_id_continue(c: char) -> bool {
1091    is_xid_continue(c) && c != '_'
1092}
1093
1094/// Whether a character can be part of a label literal's name.
1095#[inline]
1096fn is_valid_in_label_literal(c: char) -> bool {
1097    is_id_continue(c) || matches!(c, ':' | '.')
1098}
1099
1100/// Returns true if this string is valid in a label literal.
1101pub fn is_valid_label_literal_id(id: &str) -> bool {
1102    !id.is_empty() && id.chars().all(is_valid_in_label_literal)
1103}