typst_syntax/
lexer.rs

1use ecow::{EcoString, eco_format};
2use unicode_ident::{is_xid_continue, is_xid_start};
3use unicode_script::{Script, UnicodeScript};
4use unicode_segmentation::UnicodeSegmentation;
5use unscanny::Scanner;
6
7use crate::{SyntaxError, SyntaxKind, SyntaxMode, SyntaxNode};
8
/// An iterator over a source code string which returns tokens.
///
/// The same input can produce different tokens depending on the current
/// [`SyntaxMode`]; see [`Lexer::next`].
#[derive(Clone)]
pub(super) struct Lexer<'s> {
    /// The scanner: contains the underlying string and location as a "cursor".
    s: Scanner<'s>,
    /// The mode the lexer is in. This determines which kinds of tokens it
    /// produces.
    mode: SyntaxMode,
    /// Whether the last token contained a newline. Reset at the start of
    /// every call to `next`.
    newline: bool,
    /// An error for the last token. Set while lexing a token and taken again
    /// by `next` when it builds the node for that token.
    error: Option<SyntaxError>,
}
22
23impl<'s> Lexer<'s> {
24    /// Create a new lexer with the given mode and a prefix to offset column
25    /// calculations.
26    pub fn new(text: &'s str, mode: SyntaxMode) -> Self {
27        Self {
28            s: Scanner::new(text),
29            mode,
30            newline: false,
31            error: None,
32        }
33    }
34
35    /// Get the current lexing mode.
36    pub fn mode(&self) -> SyntaxMode {
37        self.mode
38    }
39
40    /// Change the lexing mode.
41    pub fn set_mode(&mut self, mode: SyntaxMode) {
42        self.mode = mode;
43    }
44
45    /// The index in the string at which the last token ends and next token
46    /// will start.
47    pub fn cursor(&self) -> usize {
48        self.s.cursor()
49    }
50
51    /// Jump to the given index in the string.
52    pub fn jump(&mut self, index: usize) {
53        self.s.jump(index);
54    }
55
56    /// Whether the last token contained a newline.
57    pub fn newline(&self) -> bool {
58        self.newline
59    }
60
61    /// The number of characters until the most recent newline from an index.
62    pub fn column(&self, index: usize) -> usize {
63        let mut s = self.s; // Make a new temporary scanner (cheap).
64        s.jump(index);
65        s.before().chars().rev().take_while(|&c| !is_newline(c)).count()
66    }
67}
68
69impl Lexer<'_> {
70    /// Construct a full-positioned syntax error.
71    fn error(&mut self, message: impl Into<EcoString>) -> SyntaxKind {
72        self.error = Some(SyntaxError::new(message));
73        SyntaxKind::Error
74    }
75
76    /// If the current node is an error, adds a hint.
77    fn hint(&mut self, message: impl Into<EcoString>) {
78        if let Some(error) = &mut self.error {
79            error.hints.push(message.into());
80        }
81    }
82}
83
84/// Shared methods with all [`SyntaxMode`].
85impl Lexer<'_> {
86    /// Return the next token in our text. Returns both the [`SyntaxNode`]
87    /// and the raw [`SyntaxKind`] to make it more ergonomic to check the kind
88    pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) {
89        debug_assert!(self.error.is_none());
90        let start = self.s.cursor();
91
92        self.newline = false;
93        let kind = match self.s.eat() {
94            Some(c) if is_space(c, self.mode) => self.whitespace(start, c),
95            Some('#') if start == 0 && self.s.eat_if('!') => self.shebang(),
96            Some('/') if self.s.eat_if('/') => self.line_comment(),
97            Some('/') if self.s.eat_if('*') => self.block_comment(),
98            Some('*') if self.s.eat_if('/') => {
99                let kind = self.error("unexpected end of block comment");
100                self.hint(
101                    "consider escaping the `*` with a backslash or \
102                     opening the block comment with `/*`",
103                );
104                kind
105            }
106            Some('`') if self.mode != SyntaxMode::Math => return self.raw(),
107            Some(c) => match self.mode {
108                SyntaxMode::Markup => self.markup(start, c),
109                SyntaxMode::Math => match self.math(start, c) {
110                    (kind, None) => kind,
111                    (kind, Some(node)) => return (kind, node),
112                },
113                SyntaxMode::Code => self.code(start, c),
114            },
115
116            None => SyntaxKind::End,
117        };
118
119        let text = self.s.from(start);
120        let node = match self.error.take() {
121            Some(error) => SyntaxNode::error(error, text),
122            None => SyntaxNode::leaf(kind, text),
123        };
124        (kind, node)
125    }
126
127    /// Eat whitespace characters greedily.
128    fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
129        let more = self.s.eat_while(|c| is_space(c, self.mode));
130        let newlines = match c {
131            // Optimize eating a single space.
132            ' ' if more.is_empty() => 0,
133            _ => count_newlines(self.s.from(start)),
134        };
135
136        self.newline = newlines > 0;
137        if self.mode == SyntaxMode::Markup && newlines >= 2 {
138            SyntaxKind::Parbreak
139        } else {
140            SyntaxKind::Space
141        }
142    }
143
144    fn shebang(&mut self) -> SyntaxKind {
145        self.s.eat_until(is_newline);
146        SyntaxKind::Shebang
147    }
148
149    fn line_comment(&mut self) -> SyntaxKind {
150        self.s.eat_until(is_newline);
151        SyntaxKind::LineComment
152    }
153
154    fn block_comment(&mut self) -> SyntaxKind {
155        let mut state = '_';
156        let mut depth = 1;
157
158        // Find the first `*/` that does not correspond to a nested `/*`.
159        while let Some(c) = self.s.eat() {
160            state = match (state, c) {
161                ('*', '/') => {
162                    depth -= 1;
163                    if depth == 0 {
164                        break;
165                    }
166                    '_'
167                }
168                ('/', '*') => {
169                    depth += 1;
170                    '_'
171                }
172                _ => c,
173            }
174        }
175
176        SyntaxKind::BlockComment
177    }
178}
179
180/// Markup.
181impl Lexer<'_> {
182    fn markup(&mut self, start: usize, c: char) -> SyntaxKind {
183        match c {
184            '\\' => self.backslash(),
185            'h' if self.s.eat_if("ttp://") => self.link(),
186            'h' if self.s.eat_if("ttps://") => self.link(),
187            '<' if self.s.at(is_id_continue) => self.label(),
188            '@' if self.s.at(is_id_continue) => self.ref_marker(),
189
190            '.' if self.s.eat_if("..") => SyntaxKind::Shorthand,
191            '-' if self.s.eat_if("--") => SyntaxKind::Shorthand,
192            '-' if self.s.eat_if('-') => SyntaxKind::Shorthand,
193            '-' if self.s.eat_if('?') => SyntaxKind::Shorthand,
194            '-' if self.s.at(char::is_numeric) => SyntaxKind::Shorthand,
195            '*' if !self.in_word() => SyntaxKind::Star,
196            '_' if !self.in_word() => SyntaxKind::Underscore,
197
198            '#' => SyntaxKind::Hash,
199            '[' => SyntaxKind::LeftBracket,
200            ']' => SyntaxKind::RightBracket,
201            '\'' => SyntaxKind::SmartQuote,
202            '"' => SyntaxKind::SmartQuote,
203            '$' => SyntaxKind::Dollar,
204            '~' => SyntaxKind::Shorthand,
205            ':' => SyntaxKind::Colon,
206            '=' => {
207                self.s.eat_while('=');
208                if self.space_or_end() { SyntaxKind::HeadingMarker } else { self.text() }
209            }
210            '-' if self.space_or_end() => SyntaxKind::ListMarker,
211            '+' if self.space_or_end() => SyntaxKind::EnumMarker,
212            '/' if self.space_or_end() => SyntaxKind::TermMarker,
213            '0'..='9' => self.numbering(start),
214
215            _ => self.text(),
216        }
217    }
218
219    fn backslash(&mut self) -> SyntaxKind {
220        if self.s.eat_if("u{") {
221            let hex = self.s.eat_while(char::is_ascii_alphanumeric);
222            if !self.s.eat_if('}') {
223                return self.error("unclosed Unicode escape sequence");
224            }
225
226            if u32::from_str_radix(hex, 16)
227                .ok()
228                .and_then(std::char::from_u32)
229                .is_none()
230            {
231                return self.error(eco_format!("invalid Unicode codepoint: {}", hex));
232            }
233
234            return SyntaxKind::Escape;
235        }
236
237        if self.s.done() || self.s.at(char::is_whitespace) {
238            SyntaxKind::Linebreak
239        } else {
240            self.s.eat();
241            SyntaxKind::Escape
242        }
243    }
244
245    /// We parse entire raw segments in the lexer as a convenience to avoid
246    /// going to and from the parser for each raw section. See comments in
247    /// [`Self::blocky_raw`] and [`Self::inline_raw`] for specific details.
248    fn raw(&mut self) -> (SyntaxKind, SyntaxNode) {
249        let start = self.s.cursor() - 1;
250
251        // Determine number of opening backticks.
252        let mut backticks = 1;
253        while self.s.eat_if('`') {
254            backticks += 1;
255        }
256
257        // Special case for ``.
258        if backticks == 2 {
259            let nodes = vec![
260                SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
261                SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
262            ];
263            return (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes));
264        }
265
266        // Find end of raw text.
267        let mut found = 0;
268        while found < backticks {
269            match self.s.eat() {
270                Some('`') => found += 1,
271                Some(_) => found = 0,
272                None => {
273                    let msg = SyntaxError::new("unclosed raw text");
274                    let error = SyntaxNode::error(msg, self.s.from(start));
275                    return (SyntaxKind::Error, error);
276                }
277            }
278        }
279        let end = self.s.cursor();
280
281        let mut nodes = Vec::with_capacity(3); // Will have at least 3.
282
283        // A closure for pushing a node onto our raw vector. Assumes the caller
284        // will move the scanner to the next location at each step.
285        let mut prev_start = start;
286        let mut push_raw = |kind, s: &Scanner| {
287            nodes.push(SyntaxNode::leaf(kind, s.from(prev_start)));
288            prev_start = s.cursor();
289        };
290
291        // Opening delimiter.
292        self.s.jump(start + backticks);
293        push_raw(SyntaxKind::RawDelim, &self.s);
294
295        if backticks >= 3 {
296            self.blocky_raw(end - backticks, &mut push_raw);
297        } else {
298            self.inline_raw(end - backticks, &mut push_raw);
299        }
300
301        // Closing delimiter.
302        self.s.jump(end);
303        push_raw(SyntaxKind::RawDelim, &self.s);
304
305        (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes))
306    }
307
    /// Raw blocks parse a language tag, have smart behavior for trimming
    /// whitespace in the start/end lines, and trim common leading whitespace
    /// from all other lines as the "dedent". The exact behavior is described
    /// below.
    ///
    /// `inner_end` is the byte index at which the raw content stops; `raw`
    /// passes the start of the closing delimiter. `push_raw` is called after
    /// each scanner move to emit a node of the given kind ending at the
    /// scanner's current cursor.
    ///
    /// ### The initial line:
    /// - A valid Typst identifier immediately following the opening delimiter
    ///   is parsed as the language tag.
    /// - We check the rest of the line and if all characters are whitespace,
    ///   trim it. Otherwise we trim a single leading space if present.
    ///   - If more trimmed characters follow on future lines, they will be
    ///     merged into the same trimmed element.
    /// - If we didn't trim the entire line, the rest is kept as text.
    ///
    /// ### Inner lines:
    /// - We determine the "dedent" by iterating over the lines. The dedent is
    ///   the minimum number of leading whitespace characters (not bytes) before
    ///   each line that has any non-whitespace characters.
    ///   - The opening delimiter's line does not contribute to the dedent, but
    ///     the closing delimiter's line does (even if that line is entirely
    ///     whitespace up to the delimiter).
    /// - We then trim the newline and dedent characters of each line, and add a
    ///   (potentially empty) text element of all remaining characters.
    ///
    /// ### The final line:
    /// - If the last line is entirely whitespace, it is trimmed.
    /// - Otherwise its text is kept like an inner line. However, if the last
    ///   non-whitespace character of the final line is a backtick, then one
    ///   ascii space (if present) is trimmed from the end.
    fn blocky_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
    where
        F: FnMut(SyntaxKind, &Scanner),
    {
        // Language tag.
        if self.s.eat_if(is_id_start) {
            self.s.eat_while(is_id_continue);
            push_raw(SyntaxKind::RawLang, &self.s);
        }

        // The rest of the function operates on the lines between the backticks.
        let mut lines = split_newlines(self.s.to(inner_end));

        // Determine dedent level.
        let dedent = lines
            .iter()
            .skip(1)
            .filter(|line| !line.chars().all(char::is_whitespace))
            // The line with the closing ``` is always taken into account
            .chain(lines.last())
            .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
            .min()
            .unwrap_or(0);

        // Trim whitespace from the last line. Will be added as a `RawTrimmed`
        // kind by the check for `self.s.cursor() < inner_end` below.
        if lines.last().is_some_and(|last| last.chars().all(char::is_whitespace)) {
            lines.pop();
        } else if let Some(last) = lines.last_mut() {
            // If last line ends in a backtick, try to trim a single space. This
            // check must happen before we add the first line since the last and
            // first lines might be the same.
            if last.trim_end().ends_with('`') {
                *last = last.strip_suffix(' ').unwrap_or(last);
            }
        }

        let mut lines = lines.into_iter();

        // Handle the first line: trim if all whitespace, or trim a single space
        // at the start. Note that the first line does not affect the dedent
        // value.
        if let Some(first_line) = lines.next() {
            if first_line.chars().all(char::is_whitespace) {
                self.s.advance(first_line.len());
                // This is the only spot we advance the scanner, but don't
                // immediately call `push_raw`. But the rest of the function
                // ensures we will always add this text to a `RawTrimmed` later.
                debug_assert!(self.s.cursor() != inner_end);
                // A proof by cases follows:
                // # First case: The loop runs
                // If the loop runs, there must be a newline following, so
                // `cursor != inner_end`. And if the loop runs, the first thing
                // it does is add a trimmed element.
                // # Second case: The final if-statement runs.
                // To _not_ reach the loop from here, we must have only one or
                // two lines:
                // 1. If one line, we cannot be here, because the first and last
                //    lines are the same, so this line will have been removed by
                //    the check for the last line being all whitespace.
                // 2. If two lines, the loop will run unless the last is fully
                //    whitespace, but if it is, it will have been popped, then
                //    the final if-statement will run because the text removed
                //    by the last line must include at least a newline, so
                //    `cursor != inner_end` here.
            } else {
                // Remember where the line ends before possibly eating a space.
                let line_end = self.s.cursor() + first_line.len();
                if self.s.eat_if(' ') {
                    // Trim a single space after the lang tag on the first line.
                    push_raw(SyntaxKind::RawTrimmed, &self.s);
                }
                // We know here that the rest of the line is non-empty.
                self.s.jump(line_end);
                push_raw(SyntaxKind::Text, &self.s);
            }
        }

        // Add lines.
        for line in lines {
            // Byte length of the (up to) `dedent` leading characters.
            let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
            self.s.eat_newline();
            self.s.advance(offset);
            push_raw(SyntaxKind::RawTrimmed, &self.s);
            self.s.advance(line.len() - offset);
            push_raw(SyntaxKind::Text, &self.s);
        }

        // Add final trimmed.
        if self.s.cursor() < inner_end {
            self.s.jump(inner_end);
            push_raw(SyntaxKind::RawTrimmed, &self.s);
        }
    }
430
431    /// Inline raw text is split on lines with non-newlines as `Text` kinds and
432    /// newlines as `RawTrimmed`. Inline raw text does not dedent the text, all
433    /// non-newline whitespace is kept.
434    fn inline_raw<F>(&mut self, inner_end: usize, mut push_raw: F)
435    where
436        F: FnMut(SyntaxKind, &Scanner),
437    {
438        while self.s.cursor() < inner_end {
439            if self.s.at(is_newline) {
440                push_raw(SyntaxKind::Text, &self.s);
441                self.s.eat_newline();
442                push_raw(SyntaxKind::RawTrimmed, &self.s);
443                continue;
444            }
445            self.s.eat();
446        }
447        push_raw(SyntaxKind::Text, &self.s);
448    }
449
450    fn link(&mut self) -> SyntaxKind {
451        let (link, balanced) = link_prefix(self.s.after());
452        self.s.advance(link.len());
453
454        if !balanced {
455            return self.error(
456                "automatic links cannot contain unbalanced brackets, \
457                 use the `link` function instead",
458            );
459        }
460
461        SyntaxKind::Link
462    }
463
464    fn numbering(&mut self, start: usize) -> SyntaxKind {
465        self.s.eat_while(char::is_ascii_digit);
466
467        let read = self.s.from(start);
468        if self.s.eat_if('.') && self.space_or_end() && read.parse::<u64>().is_ok() {
469            return SyntaxKind::EnumMarker;
470        }
471
472        self.text()
473    }
474
475    fn ref_marker(&mut self) -> SyntaxKind {
476        self.s.eat_while(is_valid_in_label_literal);
477
478        // Don't include the trailing characters likely to be part of text.
479        while matches!(self.s.scout(-1), Some('.' | ':')) {
480            self.s.uneat();
481        }
482
483        SyntaxKind::RefMarker
484    }
485
486    fn label(&mut self) -> SyntaxKind {
487        let label = self.s.eat_while(is_valid_in_label_literal);
488        if label.is_empty() {
489            return self.error("label cannot be empty");
490        }
491
492        if !self.s.eat_if('>') {
493            return self.error("unclosed label");
494        }
495
496        SyntaxKind::Label
497    }
498
499    fn text(&mut self) -> SyntaxKind {
500        macro_rules! table {
501            ($(|$c:literal)*) => {
502                static TABLE: [bool; 128] = {
503                    let mut t = [false; 128];
504                    $(t[$c as usize] = true;)*
505                    t
506                };
507            };
508        }
509
510        table! {
511            | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
512            | '[' | ']' | '~' | '-' | '.' | '\'' | '"' | '*' | '_'
513            | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
514        };
515
516        loop {
517            self.s.eat_until(|c: char| {
518                TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
519            });
520
521            // Continue with the same text node if the thing would become text
522            // anyway.
523            let mut s = self.s;
524            match s.eat() {
525                Some(' ') if s.at(char::is_alphanumeric) => {}
526                Some('/') if !s.at(['/', '*']) => {}
527                Some('-') if !s.at(['-', '?']) => {}
528                Some('.') if !s.at("..") => {}
529                Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
530                Some('@') if !s.at(is_valid_in_label_literal) => {}
531                _ => break,
532            }
533
534            self.s = s;
535        }
536
537        SyntaxKind::Text
538    }
539
540    fn in_word(&self) -> bool {
541        let wordy = |c: Option<char>| {
542            c.is_some_and(|c| {
543                c.is_alphanumeric()
544                    && !matches!(
545                        c.script(),
546                        Script::Han
547                            | Script::Hiragana
548                            | Script::Katakana
549                            | Script::Hangul
550                    )
551            })
552        };
553        let prev = self.s.scout(-2);
554        let next = self.s.peek();
555        wordy(prev) && wordy(next)
556    }
557
558    fn space_or_end(&self) -> bool {
559        self.s.done()
560            || self.s.at(char::is_whitespace)
561            || self.s.at("//")
562            || self.s.at("/*")
563    }
564}
565
566/// Math.
567impl Lexer<'_> {
568    fn math(&mut self, start: usize, c: char) -> (SyntaxKind, Option<SyntaxNode>) {
569        let kind = match c {
570            '\\' => self.backslash(),
571            '"' => self.string(),
572
573            '-' if self.s.eat_if(">>") => SyntaxKind::MathShorthand,
574            '-' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
575            '-' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
576            ':' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
577            ':' if self.s.eat_if(":=") => SyntaxKind::MathShorthand,
578            '!' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
579            '.' if self.s.eat_if("..") => SyntaxKind::MathShorthand,
580            '[' if self.s.eat_if('|') => SyntaxKind::MathShorthand,
581            '<' if self.s.eat_if("==>") => SyntaxKind::MathShorthand,
582            '<' if self.s.eat_if("-->") => SyntaxKind::MathShorthand,
583            '<' if self.s.eat_if("--") => SyntaxKind::MathShorthand,
584            '<' if self.s.eat_if("-<") => SyntaxKind::MathShorthand,
585            '<' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
586            '<' if self.s.eat_if("<-") => SyntaxKind::MathShorthand,
587            '<' if self.s.eat_if("<<") => SyntaxKind::MathShorthand,
588            '<' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
589            '<' if self.s.eat_if("==") => SyntaxKind::MathShorthand,
590            '<' if self.s.eat_if("~~") => SyntaxKind::MathShorthand,
591            '<' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
592            '<' if self.s.eat_if('<') => SyntaxKind::MathShorthand,
593            '<' if self.s.eat_if('-') => SyntaxKind::MathShorthand,
594            '<' if self.s.eat_if('~') => SyntaxKind::MathShorthand,
595            '>' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
596            '>' if self.s.eat_if(">>") => SyntaxKind::MathShorthand,
597            '=' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
598            '=' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
599            '=' if self.s.eat_if(':') => SyntaxKind::MathShorthand,
600            '>' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
601            '>' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
602            '|' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
603            '|' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
604            '|' if self.s.eat_if(']') => SyntaxKind::MathShorthand,
605            '|' if self.s.eat_if('|') => SyntaxKind::MathShorthand,
606            '~' if self.s.eat_if("~>") => SyntaxKind::MathShorthand,
607            '~' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
608            '*' | '-' | '~' => SyntaxKind::MathShorthand,
609
610            '.' => SyntaxKind::Dot,
611            ',' => SyntaxKind::Comma,
612            ';' => SyntaxKind::Semicolon,
613            ')' => SyntaxKind::RightParen,
614
615            '#' => SyntaxKind::Hash,
616            '_' => SyntaxKind::Underscore,
617            '$' => SyntaxKind::Dollar,
618            '/' => SyntaxKind::Slash,
619            '^' => SyntaxKind::Hat,
620            '\'' => SyntaxKind::Prime,
621            '&' => SyntaxKind::MathAlignPoint,
622            '√' | '∛' | '∜' => SyntaxKind::Root,
623
624            // Identifiers.
625            c if is_math_id_start(c) && self.s.at(is_math_id_continue) => {
626                self.s.eat_while(is_math_id_continue);
627                let (kind, node) = self.math_ident_or_field(start);
628                return (kind, Some(node));
629            }
630
631            // Other math atoms.
632            _ => self.math_text(start, c),
633        };
634        (kind, None)
635    }
636
637    /// Parse a single `MathIdent` or an entire `FieldAccess`.
638    fn math_ident_or_field(&mut self, start: usize) -> (SyntaxKind, SyntaxNode) {
639        let mut kind = SyntaxKind::MathIdent;
640        let mut node = SyntaxNode::leaf(kind, self.s.from(start));
641        while let Some(ident) = self.maybe_dot_ident() {
642            kind = SyntaxKind::FieldAccess;
643            let field_children = vec![
644                node,
645                SyntaxNode::leaf(SyntaxKind::Dot, '.'),
646                SyntaxNode::leaf(SyntaxKind::Ident, ident),
647            ];
648            node = SyntaxNode::inner(kind, field_children);
649        }
650        (kind, node)
651    }
652
653    /// If at a dot and a math identifier, eat and return the identifier.
654    fn maybe_dot_ident(&mut self) -> Option<&str> {
655        if self.s.scout(1).is_some_and(is_math_id_start) && self.s.eat_if('.') {
656            let ident_start = self.s.cursor();
657            self.s.eat();
658            self.s.eat_while(is_math_id_continue);
659            Some(self.s.from(ident_start))
660        } else {
661            None
662        }
663    }
664
665    fn math_text(&mut self, start: usize, c: char) -> SyntaxKind {
666        // Keep numbers and grapheme clusters together.
667        if c.is_numeric() {
668            self.s.eat_while(char::is_numeric);
669            let mut s = self.s;
670            if s.eat_if('.') && !s.eat_while(char::is_numeric).is_empty() {
671                self.s = s;
672            }
673            SyntaxKind::MathText
674        } else {
675            let len = self
676                .s
677                .get(start..self.s.string().len())
678                .graphemes(true)
679                .next()
680                .map_or(0, str::len);
681            self.s.jump(start + len);
682            if len > c.len_utf8() {
683                // Grapheme clusters are treated as normal text and stay grouped
684                // This may need to change in the future.
685                SyntaxKind::Text
686            } else {
687                SyntaxKind::MathText
688            }
689        }
690    }
691
692    /// Handle named arguments in math function call.
693    pub fn maybe_math_named_arg(&mut self, start: usize) -> Option<SyntaxNode> {
694        let cursor = self.s.cursor();
695        self.s.jump(start);
696        if self.s.eat_if(is_id_start) {
697            self.s.eat_while(is_id_continue);
698            // Check that a colon directly follows the identifier, and not the
699            // `:=` or `::=` math shorthands.
700            if self.s.at(':') && !self.s.at(":=") && !self.s.at("::=") {
701                // Check that the identifier is not just `_`.
702                let node = if self.s.from(start) != "_" {
703                    SyntaxNode::leaf(SyntaxKind::Ident, self.s.from(start))
704                } else {
705                    let msg = SyntaxError::new("expected identifier, found underscore");
706                    SyntaxNode::error(msg, self.s.from(start))
707                };
708                return Some(node);
709            }
710        }
711        self.s.jump(cursor);
712        None
713    }
714
715    /// Handle spread arguments in math function call.
716    pub fn maybe_math_spread_arg(&mut self, start: usize) -> Option<SyntaxNode> {
717        let cursor = self.s.cursor();
718        self.s.jump(start);
719        if self.s.eat_if("..") {
720            // Check that neither a space nor a dot follows the spread syntax.
721            // A dot would clash with the `...` math shorthand.
722            if !self.space_or_end() && !self.s.at('.') {
723                let node = SyntaxNode::leaf(SyntaxKind::Dots, self.s.from(start));
724                return Some(node);
725            }
726        }
727        self.s.jump(cursor);
728        None
729    }
730}
731
732/// Code.
733impl Lexer<'_> {
734    fn code(&mut self, start: usize, c: char) -> SyntaxKind {
735        match c {
736            '<' if self.s.at(is_id_continue) => self.label(),
737            '0'..='9' => self.number(start, c),
738            '.' if self.s.at(char::is_ascii_digit) => self.number(start, c),
739            '"' => self.string(),
740
741            '=' if self.s.eat_if('=') => SyntaxKind::EqEq,
742            '!' if self.s.eat_if('=') => SyntaxKind::ExclEq,
743            '<' if self.s.eat_if('=') => SyntaxKind::LtEq,
744            '>' if self.s.eat_if('=') => SyntaxKind::GtEq,
745            '+' if self.s.eat_if('=') => SyntaxKind::PlusEq,
746            '-' | '\u{2212}' if self.s.eat_if('=') => SyntaxKind::HyphEq,
747            '*' if self.s.eat_if('=') => SyntaxKind::StarEq,
748            '/' if self.s.eat_if('=') => SyntaxKind::SlashEq,
749            '.' if self.s.eat_if('.') => SyntaxKind::Dots,
750            '=' if self.s.eat_if('>') => SyntaxKind::Arrow,
751
752            '{' => SyntaxKind::LeftBrace,
753            '}' => SyntaxKind::RightBrace,
754            '[' => SyntaxKind::LeftBracket,
755            ']' => SyntaxKind::RightBracket,
756            '(' => SyntaxKind::LeftParen,
757            ')' => SyntaxKind::RightParen,
758            '$' => SyntaxKind::Dollar,
759            ',' => SyntaxKind::Comma,
760            ';' => SyntaxKind::Semicolon,
761            ':' => SyntaxKind::Colon,
762            '.' => SyntaxKind::Dot,
763            '+' => SyntaxKind::Plus,
764            '-' | '\u{2212}' => SyntaxKind::Minus,
765            '*' => SyntaxKind::Star,
766            '/' => SyntaxKind::Slash,
767            '=' => SyntaxKind::Eq,
768            '<' => SyntaxKind::Lt,
769            '>' => SyntaxKind::Gt,
770
771            c if is_id_start(c) => self.ident(start),
772
773            c => self.error(eco_format!("the character `{c}` is not valid in code")),
774        }
775    }
776
777    fn ident(&mut self, start: usize) -> SyntaxKind {
778        self.s.eat_while(is_id_continue);
779        let ident = self.s.from(start);
780
781        let prev = self.s.get(0..start);
782        if (!prev.ends_with(['.', '@']) || prev.ends_with(".."))
783            && let Some(keyword) = keyword(ident)
784        {
785            return keyword;
786        }
787
788        if ident == "_" { SyntaxKind::Underscore } else { SyntaxKind::Ident }
789    }
790
791    fn number(&mut self, start: usize, first_c: char) -> SyntaxKind {
792        // Handle alternative integer bases.
793        let base = match first_c {
794            '0' if self.s.eat_if('b') => 2,
795            '0' if self.s.eat_if('o') => 8,
796            '0' if self.s.eat_if('x') => 16,
797            _ => 10,
798        };
799
800        // Read the initial digits.
801        if base == 16 {
802            self.s.eat_while(char::is_ascii_alphanumeric);
803        } else {
804            self.s.eat_while(char::is_ascii_digit);
805        }
806
807        // Read floating point digits and exponents.
808        let mut is_float = false;
809        if base == 10 {
810            // Read digits following a dot. Make sure not to confuse a spread
811            // operator or a method call for the decimal separator.
812            if first_c == '.' {
813                is_float = true; // We already ate the trailing digits above.
814            } else if !self.s.at("..")
815                && !self.s.scout(1).is_some_and(is_id_start)
816                && self.s.eat_if('.')
817            {
818                is_float = true;
819                self.s.eat_while(char::is_ascii_digit);
820            }
821
822            // Read the exponent.
823            if !self.s.at("em") && self.s.eat_if(['e', 'E']) {
824                is_float = true;
825                self.s.eat_if(['+', '-']);
826                self.s.eat_while(char::is_ascii_digit);
827            }
828        }
829
830        let number = self.s.from(start);
831        let suffix = self.s.eat_while(|c: char| c.is_ascii_alphanumeric() || c == '%');
832
833        let mut suffix_result = match suffix {
834            "" => Ok(None),
835            "pt" | "mm" | "cm" | "in" | "deg" | "rad" | "em" | "fr" | "%" => Ok(Some(())),
836            _ => Err(eco_format!("invalid number suffix: {suffix}")),
837        };
838
839        let number_result = if is_float && number.parse::<f64>().is_err() {
840            // The only invalid case should be when a float lacks digits after
841            // the exponent: e.g. `1.2e`, `2.3E-`, or `1EM`.
842            Err(eco_format!("invalid floating point number: {number}"))
843        } else if base == 10 {
844            Ok(())
845        } else {
846            let name = match base {
847                2 => "binary",
848                8 => "octal",
849                16 => "hexadecimal",
850                _ => unreachable!(),
851            };
852            // The index `[2..]` skips the leading `0b`/`0o`/`0x`.
853            match i64::from_str_radix(&number[2..], base) {
854                Ok(_) if suffix.is_empty() => Ok(()),
855                Ok(value) => {
856                    if suffix_result.is_ok() {
857                        suffix_result = Err(eco_format!(
858                            "try using a decimal number: {value}{suffix}"
859                        ));
860                    }
861                    Err(eco_format!("{name} numbers cannot have a suffix"))
862                }
863                Err(_) => Err(eco_format!("invalid {name} number: {number}")),
864            }
865        };
866
867        // Return our number or write an error with helpful hints.
868        match (number_result, suffix_result) {
869            // Valid numbers :D
870            (Ok(()), Ok(None)) if is_float => SyntaxKind::Float,
871            (Ok(()), Ok(None)) => SyntaxKind::Int,
872            (Ok(()), Ok(Some(()))) => SyntaxKind::Numeric,
873            // Invalid numbers :(
874            (Err(number_err), Err(suffix_err)) => {
875                let err = self.error(number_err);
876                self.hint(suffix_err);
877                err
878            }
879            (Ok(()), Err(msg)) | (Err(msg), Ok(_)) => self.error(msg),
880        }
881    }
882
883    fn string(&mut self) -> SyntaxKind {
884        let mut escaped = false;
885        self.s.eat_until(|c| {
886            let stop = c == '"' && !escaped;
887            escaped = c == '\\' && !escaped;
888            stop
889        });
890
891        if !self.s.eat_if('"') {
892            return self.error("unclosed string");
893        }
894
895        SyntaxKind::Str
896    }
897}
898
899/// Try to parse an identifier into a keyword.
900fn keyword(ident: &str) -> Option<SyntaxKind> {
901    Some(match ident {
902        "none" => SyntaxKind::None,
903        "auto" => SyntaxKind::Auto,
904        "true" => SyntaxKind::Bool,
905        "false" => SyntaxKind::Bool,
906        "not" => SyntaxKind::Not,
907        "and" => SyntaxKind::And,
908        "or" => SyntaxKind::Or,
909        "let" => SyntaxKind::Let,
910        "set" => SyntaxKind::Set,
911        "show" => SyntaxKind::Show,
912        "context" => SyntaxKind::Context,
913        "if" => SyntaxKind::If,
914        "else" => SyntaxKind::Else,
915        "for" => SyntaxKind::For,
916        "in" => SyntaxKind::In,
917        "while" => SyntaxKind::While,
918        "break" => SyntaxKind::Break,
919        "continue" => SyntaxKind::Continue,
920        "return" => SyntaxKind::Return,
921        "import" => SyntaxKind::Import,
922        "include" => SyntaxKind::Include,
923        "as" => SyntaxKind::As,
924        _ => return None,
925    })
926}
927
/// Lexer-specific convenience methods added on top of [`Scanner`].
trait ScannerExt {
    /// Move the cursor forward by `by`, i.e. jump to `cursor() + by`.
    fn advance(&mut self, by: usize);
    /// Consume one newline if the scanner is at one, treating `\r\n` as a
    /// single newline. Returns whether a newline was consumed.
    fn eat_newline(&mut self) -> bool;
}
932
933impl ScannerExt for Scanner<'_> {
934    fn advance(&mut self, by: usize) {
935        self.jump(self.cursor() + by);
936    }
937
938    fn eat_newline(&mut self) -> bool {
939        let ate = self.eat_if(is_newline);
940        if ate && self.before().ends_with('\r') {
941            self.eat_if('\n');
942        }
943        ate
944    }
945}
946
947/// Whether a character will become a [`SyntaxKind::Space`] token.
948#[inline]
949fn is_space(character: char, mode: SyntaxMode) -> bool {
950    match mode {
951        SyntaxMode::Markup => matches!(character, ' ' | '\t') || is_newline(character),
952        _ => character.is_whitespace(),
953    }
954}
955
/// Whether a character is interpreted as a newline by Typst.
#[inline]
pub fn is_newline(character: char) -> bool {
    match character {
        // Line Feed, Vertical Tab, Form Feed, Carriage Return
        // (the contiguous range U+000A..=U+000D).
        '\n'..='\r' => true,
        // Next Line.
        '\u{0085}' => true,
        // Line Separator, Paragraph Separator.
        '\u{2028}'..='\u{2029}' => true,
        _ => false,
    }
}
967
/// Extracts a prefix of the text that is a link and also returns whether the
/// parentheses and brackets in the link were balanced.
///
/// The link ends at the first character that cannot appear in a link, or at
/// a closing bracket that does not match the most recently opened one.
/// Trailing punctuation that is likely part of the surrounding text is then
/// stripped from the link.
///
/// The link is reported as unbalanced whenever an opening bracket is still
/// pending at its end, including when the link was cut short by a closer of
/// the wrong kind (e.g. `(` followed by `]`).
pub fn link_prefix(text: &str) -> (&str, bool) {
    // Openers that have not been closed yet, innermost last.
    let mut open: Vec<char> = Vec::new();

    // Byte offset of the first character that is not part of the link.
    let mut end = text.len();
    for (idx, c) in text.char_indices() {
        #[rustfmt::skip]
        let part_of_link = match c {
            | '0' ..= '9'
            | 'a' ..= 'z'
            | 'A' ..= 'Z'
            | '!' | '#' | '$' | '%' | '&' | '*' | '+'
            | ',' | '-' | '.' | '/' | ':' | ';' | '='
            | '?' | '@' | '_' | '~' | '\'' => true,
            '[' | '(' => {
                open.push(c);
                true
            }
            ']' | ')' => {
                let opener = if c == ']' { '[' } else { '(' };
                // Only pop on a match: a closer of the wrong kind (or one
                // without any opener) ends the link and must leave a pending
                // opener in place so the link is reported as unbalanced.
                let closes = open.last() == Some(&opener);
                if closes {
                    open.pop();
                }
                closes
            }
            _ => false,
        };

        if !part_of_link {
            end = idx;
            break;
        }
    }

    // Don't include the trailing characters likely to be part of text.
    let link = text[..end].trim_end_matches(['!', ',', '.', ':', ';', '?', '\'']);

    (link, open.is_empty())
}
1004
1005/// Split text at newlines. These newline characters are not kept.
1006pub fn split_newlines(text: &str) -> Vec<&str> {
1007    let mut s = Scanner::new(text);
1008    let mut lines = Vec::new();
1009    let mut start = 0;
1010    let mut end = 0;
1011
1012    while let Some(c) = s.eat() {
1013        if is_newline(c) {
1014            if c == '\r' {
1015                s.eat_if('\n');
1016            }
1017
1018            lines.push(&text[start..end]);
1019            start = s.cursor();
1020        }
1021        end = s.cursor();
1022    }
1023
1024    lines.push(&text[start..]);
1025    lines
1026}
1027
1028/// Count the number of newlines in text.
1029fn count_newlines(text: &str) -> usize {
1030    let mut newlines = 0;
1031    let mut s = Scanner::new(text);
1032    while let Some(c) = s.eat() {
1033        if is_newline(c) {
1034            if c == '\r' {
1035                s.eat_if('\n');
1036            }
1037            newlines += 1;
1038        }
1039    }
1040    newlines
1041}
1042
1043/// Whether a string is a valid Typst identifier.
1044///
1045/// In addition to what is specified in the [Unicode Standard][uax31], we allow:
1046/// - `_` as a starting character,
1047/// - `_` and `-` as continuing characters.
1048///
1049/// [uax31]: http://www.unicode.org/reports/tr31/
1050#[inline]
1051pub fn is_ident(string: &str) -> bool {
1052    let mut chars = string.chars();
1053    chars
1054        .next()
1055        .is_some_and(|c| is_id_start(c) && chars.all(is_id_continue))
1056}
1057
1058/// Whether a character can start an identifier.
1059#[inline]
1060pub fn is_id_start(c: char) -> bool {
1061    is_xid_start(c) || c == '_'
1062}
1063
1064/// Whether a character can continue an identifier.
1065#[inline]
1066pub fn is_id_continue(c: char) -> bool {
1067    is_xid_continue(c) || c == '_' || c == '-'
1068}
1069
/// Whether a character can start an identifier in math.
///
/// Unlike [`is_id_start`], this does not accept `_`.
#[inline]
fn is_math_id_start(c: char) -> bool {
    is_xid_start(c)
}
1075
1076/// Whether a character can continue an identifier in math.
1077#[inline]
1078fn is_math_id_continue(c: char) -> bool {
1079    is_xid_continue(c) && c != '_'
1080}
1081
1082/// Whether a character can be part of a label literal's name.
1083#[inline]
1084fn is_valid_in_label_literal(c: char) -> bool {
1085    is_id_continue(c) || matches!(c, ':' | '.')
1086}
1087
1088/// Returns true if this string is valid in a label literal.
1089pub fn is_valid_label_literal_id(id: &str) -> bool {
1090    !id.is_empty() && id.chars().all(is_valid_in_label_literal)
1091}
typst_syntax/lexer.rs

typst_syntax/
lexer.rs