1use std::num::IntErrorKind;
2
3use ecow::{EcoString, EcoVec, eco_format, eco_vec};
4use typst_utils::default_math_class;
5use unicode_ident::{is_xid_continue, is_xid_start};
6use unicode_math_class::MathClass;
7use unicode_script::{Script, UnicodeScript};
8use unicode_segmentation::UnicodeSegmentation;
9use unscanny::Scanner;
10
11use crate::{SyntaxKind, SyntaxMode, SyntaxNode};
12
/// A lexer over a source string that hands out one token (kind plus syntax
/// node) per call to `next`.
#[derive(Clone)]
pub(super) struct Lexer<'s> {
    /// The scanner holding the source text and the current cursor position.
    s: Scanner<'s>,
    /// The syntax mode that selects which mode-specific lexing routine is
    /// used for the next token.
    mode: SyntaxMode,
    /// Whether the most recently lexed whitespace token contained at least
    /// one newline. Reset to `false` at the start of every token.
    newline: bool,
    /// The error message and its hints recorded for the token currently being
    /// lexed, if any. Taken (and thereby cleared) when the token is finished.
    error: Option<(EcoString, EcoVec<EcoString>)>,
}
27
28impl<'s> Lexer<'s> {
29 pub fn new(text: &'s str, mode: SyntaxMode) -> Self {
32 Self {
33 s: Scanner::new(text),
34 mode,
35 newline: false,
36 error: None,
37 }
38 }
39
40 pub fn mode(&self) -> SyntaxMode {
42 self.mode
43 }
44
45 pub fn set_mode(&mut self, mode: SyntaxMode) {
47 self.mode = mode;
48 }
49
50 pub fn cursor(&self) -> usize {
53 self.s.cursor()
54 }
55
56 pub fn jump(&mut self, index: usize) {
58 self.s.jump(index);
59 }
60
61 pub fn newline(&self) -> bool {
63 self.newline
64 }
65
66 pub fn column(&self, index: usize) -> usize {
68 let mut s = self.s; s.jump(index);
70 s.before().chars().rev().take_while(|&c| !is_newline(c)).count()
71 }
72}
73
74impl Lexer<'_> {
75 fn error(&mut self, message: impl Into<EcoString>) -> SyntaxKind {
77 debug_assert!(self.error.is_none());
78 self.error = Some((message.into(), eco_vec![]));
79 SyntaxKind::Error
80 }
81
82 fn hint(&mut self, message: impl Into<EcoString>) {
84 if let Some((_message, hints)) = &mut self.error {
85 hints.push(message.into());
86 }
87 }
88}
89
90impl Lexer<'_> {
92 pub fn next(&mut self) -> (SyntaxKind, SyntaxNode) {
95 debug_assert!(self.error.is_none());
96 let start = self.s.cursor();
97
98 self.newline = false;
99 let kind = match self.s.eat() {
100 Some(c) if is_space(c, self.mode) => self.whitespace(start, c),
101 Some('#') if start == 0 && self.s.eat_if('!') => self.shebang(),
102 Some('/') if self.s.eat_if('/') => self.line_comment(),
103 Some('/') if self.s.eat_if('*') => self.block_comment(),
104 Some('*') if self.s.eat_if('/') => {
105 let error = self.error("unexpected end of block comment");
106 self.hint(
107 "consider escaping the `*` with a backslash or \
108 opening the block comment with `/*`",
109 );
110 error
111 }
112 Some('`') if self.mode != SyntaxMode::Math => return self.raw(),
113 Some(c) => match self.mode {
114 SyntaxMode::Markup => self.markup(start, c),
115 SyntaxMode::Math => match self.math(start, c) {
116 (kind, None) => kind,
117 (kind, Some(node)) => return (kind, node),
118 },
119 SyntaxMode::Code => self.code(start, c),
120 },
121
122 None => SyntaxKind::End,
123 };
124
125 let text = self.s.from(start);
126 let node = match self.error.take() {
127 Some((message, hints)) => SyntaxNode::error(message, text).with_hints(hints),
128 None => SyntaxNode::leaf(kind, text),
129 };
130 (kind, node)
131 }
132
133 fn whitespace(&mut self, start: usize, c: char) -> SyntaxKind {
135 let more = self.s.eat_while(|c| is_space(c, self.mode));
136 let newlines = match c {
137 ' ' if more.is_empty() => 0,
139 _ => count_newlines(self.s.from(start)),
140 };
141
142 self.newline = newlines > 0;
143 if self.mode == SyntaxMode::Markup && newlines >= 2 {
144 SyntaxKind::Parbreak
145 } else {
146 SyntaxKind::Space
147 }
148 }
149
150 fn shebang(&mut self) -> SyntaxKind {
151 self.s.eat_until(is_newline);
152 SyntaxKind::Shebang
153 }
154
155 fn line_comment(&mut self) -> SyntaxKind {
156 self.s.eat_until(is_newline);
157 SyntaxKind::LineComment
158 }
159
160 fn block_comment(&mut self) -> SyntaxKind {
161 let mut state = '_';
162 let mut depth = 1;
163
164 while let Some(c) = self.s.eat() {
166 state = match (state, c) {
167 ('*', '/') => {
168 depth -= 1;
169 if depth == 0 {
170 break;
171 }
172 '_'
173 }
174 ('/', '*') => {
175 depth += 1;
176 '_'
177 }
178 _ => c,
179 }
180 }
181
182 SyntaxKind::BlockComment
183 }
184}
185
/// Lexing of backtick-delimited raw text.
impl Lexer<'_> {
    /// Lexes a raw element whose first opening backtick has already been
    /// eaten.
    ///
    /// Returns a `Raw` inner node made of delimiter, optional language tag,
    /// text and trimmed pieces, or an error node if the closing delimiter is
    /// missing.
    fn raw(&mut self) -> (SyntaxKind, SyntaxNode) {
        // Step back over the backtick that was eaten before this call.
        let start = self.s.cursor() - 1;

        // Count the opening backticks; one is already consumed.
        let mut backticks = 1;
        while self.s.eat_if('`') {
            backticks += 1;
        }

        // Exactly two backticks are an empty raw element: one opening and
        // one closing delimiter with nothing in between.
        if backticks == 2 {
            let nodes = vec![
                SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
                SyntaxNode::leaf(SyntaxKind::RawDelim, "`"),
            ];
            return (SyntaxKind::Raw, SyntaxNode::inner(SyntaxKind::Raw, nodes));
        }

        // Search for a run of `backticks` consecutive backticks. Any other
        // character resets the run.
        let mut found = 0;
        while found < backticks {
            match self.s.eat() {
                Some('`') => found += 1,
                Some(_) => found = 0,
                None => {
                    let message = "unclosed raw text";
                    let error = SyntaxNode::error(message, self.s.from(start));
                    return (SyntaxKind::Error, error);
                }
            }
        }
        let end = self.s.cursor();

        // A separate scanner over only the text between the delimiters.
        let mut inner = Scanner::new(self.s.get(start + backticks..end - backticks));
        let inner_len = inner.string().len();

        // The closing delimiter's text; a clone of the same node is used as
        // the opening delimiter.
        let delim = SyntaxNode::leaf(SyntaxKind::RawDelim, self.s.from(end - backticks));
        let mut nodes = vec![delim.clone()];

        let mut tag = None;
        let mut diff_future_tag_len = None;
        // Three or more backticks: optional language tag plus block handling.
        // A single backtick: inline handling.
        if delim.len() >= 3 {
            (tag, diff_future_tag_len) = Self::raw_lang_tag(&mut inner);
            if let Some(tag) = tag {
                nodes.push(SyntaxNode::leaf(SyntaxKind::RawLang, tag));
            }
            Self::blocky_raw(&mut inner, &mut nodes);
        } else {
            Self::inline_raw(&mut inner, &mut nodes);
        }

        nodes.push(delim);

        let mut raw = SyntaxNode::inner(SyntaxKind::Raw, nodes);

        Self::add_raw_warnings(&mut raw, backticks, diff_future_tag_len, tag, inner_len);

        (SyntaxKind::Raw, raw)
    }

    /// Reads the language tag at the scanner's position.
    ///
    /// Returns a pair of:
    /// - the tag: the identifier at the cursor, if the text starts with an
    ///   identifier-start character (the scanner is left after it);
    /// - `Some(len)` with the byte length of the text up to the first
    ///   whitespace or backtick, but only if there is no tag or the tag's
    ///   length differs from it.
    ///
    /// Returns `(None, None)` without moving the scanner if the text starts
    /// with whitespace or a backtick (or is empty).
    fn raw_lang_tag<'a>(s: &mut Scanner<'a>) -> (Option<&'a str>, Option<usize>) {
        let start = s.cursor();
        let future_tag = s.eat_until(|c: char| c.is_whitespace() || c == '`');
        if future_tag.is_empty() {
            return (None, None);
        }
        // Rewind: the scan above only measured the "future" tag.
        s.jump(start);
        let tag = s.eat_if(is_id_start).then(|| {
            s.eat_while(is_id_continue);
            s.from(start)
        });
        let diff_future_tag_len = tag
            .is_none_or(|tag| tag.len() != future_tag.len())
            .then_some(future_tag.len());
        (tag, diff_future_tag_len)
    }

    /// Splits the remaining text of a block-level raw element into
    /// alternating `RawTrimmed` and `Text` leaves, one pair per line, and
    /// pushes them onto `nodes`.
    fn blocky_raw(s: &mut Scanner, nodes: &mut Vec<SyntaxNode>) {
        let mut lines = split_newlines(s.after());

        // The dedent is the smallest count of leading whitespace characters
        // among the non-blank lines after the first. The last line is always
        // included, even when it is blank.
        let dedent = lines
            .iter()
            .skip(1)
            .filter(|line| !line.chars().all(char::is_whitespace))
            .chain(lines.last())
            .map(|line| line.chars().take_while(|c| c.is_whitespace()).count())
            .min()
            .unwrap_or(0);

        // A whitespace-only last line is dropped here; what remains of it is
        // emitted as `RawTrimmed` by the `!s.done()` check at the end.
        if lines.last().is_some_and(|last| last.chars().all(char::is_whitespace)) {
            lines.pop();
        } else if let Some(last) = lines.last_mut() {
            // If the last line ends in a backtick (ignoring trailing
            // whitespace), strip one trailing space if there is one. This
            // happens before the first line is handled because the first and
            // last line may be the same line.
            if last.trim_end().ends_with('`') {
                *last = last.strip_suffix(' ').unwrap_or(last);
            }
        }

        // `prev` marks where the next leaf starts; `push_leaf` emits the text
        // from `prev` to the cursor and advances `prev`.
        let mut prev = s.cursor();
        let mut push_leaf = |kind, s: &Scanner| {
            nodes.push(SyntaxNode::leaf(kind, s.from(prev)));
            prev = s.cursor();
        };

        let mut lines = lines.into_iter();

        if let Some(first_line) = lines.next() {
            if first_line.chars().all(char::is_whitespace) {
                // A blank first line gets no leaf of its own: `prev` is not
                // moved, so its bytes become part of the next `RawTrimmed`.
                s.advance(first_line.len());
                debug_assert!(!s.done());
            } else {
                let line_end = s.cursor() + first_line.len();
                // A single leading space on the first line is trimmed.
                if s.eat_if(' ') {
                    push_leaf(SyntaxKind::RawTrimmed, s);
                }
                s.jump(line_end);
                push_leaf(SyntaxKind::Text, s);
            }
        }

        for line in lines {
            // Byte length of up to `dedent` leading characters of this line.
            let offset: usize = line.chars().take(dedent).map(char::len_utf8).sum();
            s.eat_newline();
            s.advance(offset);
            push_leaf(SyntaxKind::RawTrimmed, s);
            s.advance(line.len() - offset);
            push_leaf(SyntaxKind::Text, s);
        }

        // Whatever was trimmed from the end above is emitted as one final
        // `RawTrimmed` leaf.
        if !s.done() {
            nodes.push(SyntaxNode::leaf(SyntaxKind::RawTrimmed, s.after()));
        }
    }

    /// Splits the text of an inline raw element into `Text` leaves separated
    /// by `RawTrimmed` leaves that hold the newlines.
    fn inline_raw(s: &mut Scanner, nodes: &mut Vec<SyntaxNode>) {
        let mut prev = s.cursor();
        while !s.done() {
            if s.at(is_newline) {
                // Emit the text before the newline, then the newline itself.
                nodes.push(SyntaxNode::leaf(SyntaxKind::Text, s.from(prev)));
                prev = s.cursor();
                s.eat_newline();
                nodes.push(SyntaxNode::leaf(SyntaxKind::RawTrimmed, s.from(prev)));
                prev = s.cursor();
                continue;
            }
            s.eat();
        }
        // The final piece is always pushed, even if it is empty.
        nodes.push(SyntaxNode::leaf(SyntaxKind::Text, s.from(prev)));
    }

    /// Attaches warnings and hints about the language tag to a raw node.
    ///
    /// - `backticks`: number of opening backticks; used as the offset at
    ///   which the tag starts within the node.
    /// - `diff_future_tag_len`: second result of `raw_lang_tag`.
    /// - `tag`: the language tag found, if any.
    /// - `inner_len`: byte length of the text between the delimiters.
    fn add_raw_warnings(
        raw: &mut SyntaxNode,
        backticks: usize,
        diff_future_tag_len: Option<usize>,
        tag: Option<&str>,
        inner_len: usize,
    ) {
        if let Some(future_tag_len) = diff_future_tag_len {
            // The text up to the first whitespace is not exactly the tag.
            let future_range = backticks..backticks + future_tag_len;
            if let Some(tag) = tag {
                raw.warn_at(
                    future_range,
                    "no whitespace between language tag and raw text",
                );
                raw.hint(eco_format!(
                    "currently, Typst is treating `{tag}` as the language tag"
                ));
                raw.hint(
                    "in the next version of Typst, this will change and we will treat \
                     all text until the first whitespace as the language tag",
                );
                let tag_range = backticks..backticks + tag.len();
                raw.hint_at(tag_range.clone(), eco_format!(
                    "if the current behavior is correct, please add a space after `{tag}`"
                ));
                raw.hint_at(
                    tag_range,
                    "otherwise, add a space or newline after the initial backticks",
                );
            } else {
                raw.warn_at(future_range, "no whitespace before raw text");
                raw.hint(
                    "in the next version of Typst, this text will be treated as \
                     the language tag for this element",
                );
                raw.hint("to avoid this, add a space after the initial backticks");
            }
        } else if let Some(tag) = tag
            && inner_len == tag.len()
        {
            // The tag makes up the whole content, so no raw text remains.
            raw.warn("empty raw text");
            raw.hint(eco_format!("Typst is treating `{tag}` as the language tag"));
            let tag_range = backticks..backticks + tag.len();
            raw.hint_at(
                tag_range,
                "to treat this as text, add a space after the initial backticks",
            );
        }
    }
}
484
485impl Lexer<'_> {
487 fn markup(&mut self, start: usize, c: char) -> SyntaxKind {
488 match c {
489 '\\' => self.backslash(),
490 'h' if self.s.eat_if("ttp://") => self.link(),
491 'h' if self.s.eat_if("ttps://") => self.link(),
492 '<' if self.s.at(is_id_continue) => self.label(),
493 '@' if self.s.at(is_id_continue) => self.ref_marker(),
494
495 '.' if self.s.eat_if("..") => SyntaxKind::Shorthand,
496 '-' if self.s.eat_if("--") => SyntaxKind::Shorthand,
497 '-' if self.s.eat_if('-') => SyntaxKind::Shorthand,
498 '-' if self.s.eat_if('?') => SyntaxKind::Shorthand,
499 '-' if self.s.at(char::is_numeric) => SyntaxKind::Shorthand,
500 '*' if !self.in_word() => SyntaxKind::Star,
501 '_' if !self.in_word() => SyntaxKind::Underscore,
502
503 '#' => SyntaxKind::Hash,
504 '[' => SyntaxKind::LeftBracket,
505 ']' => SyntaxKind::RightBracket,
506 '\'' => SyntaxKind::SmartQuote,
507 '"' => SyntaxKind::SmartQuote,
508 '$' => SyntaxKind::Dollar,
509 '~' => SyntaxKind::Shorthand,
510 ':' => SyntaxKind::Colon,
511 '=' => {
512 self.s.eat_while('=');
513 if self.space_or_end() { SyntaxKind::HeadingMarker } else { self.text() }
514 }
515 '-' if self.space_or_end() => SyntaxKind::ListMarker,
516 '+' if self.space_or_end() => SyntaxKind::EnumMarker,
517 '/' if self.space_or_end() => SyntaxKind::TermMarker,
518 '0'..='9' => self.numbering(start),
519
520 _ => self.text(),
521 }
522 }
523
524 fn backslash(&mut self) -> SyntaxKind {
525 if self.s.eat_if("u{") {
526 let hex = self.s.eat_while(char::is_ascii_alphanumeric);
527 if !self.s.eat_if('}') {
528 return self.error("unclosed Unicode escape sequence");
529 }
530
531 if u32::from_str_radix(hex, 16)
532 .ok()
533 .and_then(std::char::from_u32)
534 .is_none()
535 {
536 return self.error(eco_format!("invalid Unicode codepoint: {hex}"));
537 }
538
539 return SyntaxKind::Escape;
540 }
541
542 if self.s.done() || self.s.at(char::is_whitespace) {
543 SyntaxKind::Linebreak
544 } else {
545 self.s.eat();
546 SyntaxKind::Escape
547 }
548 }
549
550 fn link(&mut self) -> SyntaxKind {
551 let (link, balanced) = link_prefix(self.s.after());
552 self.s.advance(link.len());
553
554 if !balanced {
555 return self.error(
556 "automatic links cannot contain unbalanced brackets, \
557 use the `link` function instead",
558 );
559 }
560
561 SyntaxKind::Link
562 }
563
564 fn numbering(&mut self, start: usize) -> SyntaxKind {
565 self.s.eat_while(char::is_ascii_digit);
566
567 let read = self.s.from(start);
568 if self.s.eat_if('.') && self.space_or_end() && read.parse::<u64>().is_ok() {
569 return SyntaxKind::EnumMarker;
570 }
571
572 self.text()
573 }
574
575 fn ref_marker(&mut self) -> SyntaxKind {
576 self.s.eat_while(is_valid_in_label_literal);
577
578 while matches!(self.s.scout(-1), Some('.' | ':')) {
580 self.s.uneat();
581 }
582
583 SyntaxKind::RefMarker
584 }
585
586 fn label(&mut self) -> SyntaxKind {
587 let label = self.s.eat_while(is_valid_in_label_literal);
588 if label.is_empty() {
589 return self.error("label cannot be empty");
590 }
591
592 if !self.s.eat_if('>') {
593 return self.error("unclosed label");
594 }
595
596 SyntaxKind::Label
597 }
598
599 fn text(&mut self) -> SyntaxKind {
600 macro_rules! table {
601 ($(|$c:literal)*) => {
602 static TABLE: [bool; 128] = {
603 let mut t = [false; 128];
604 $(t[$c as usize] = true;)*
605 t
606 };
607 };
608 }
609
610 table! {
611 | ' ' | '\t' | '\n' | '\x0b' | '\x0c' | '\r' | '\\' | '/'
612 | '[' | ']' | '~' | '-' | '.' | '\'' | '"' | '*' | '_'
613 | ':' | 'h' | '`' | '$' | '<' | '>' | '@' | '#'
614 };
615
616 loop {
617 self.s.eat_until(|c: char| {
618 TABLE.get(c as usize).copied().unwrap_or_else(|| c.is_whitespace())
619 });
620
621 let mut s = self.s;
624 match s.eat() {
625 Some(' ') if s.at(char::is_alphanumeric) => {}
626 Some('/') if !s.at(['/', '*']) => {}
627 Some('-') if !s.at(['-', '?']) => {}
628 Some('.') if !s.at("..") => {}
629 Some('h') if !s.at("ttp://") && !s.at("ttps://") => {}
630 Some('@') if !s.at(is_valid_in_label_literal) => {}
631 _ => break,
632 }
633
634 self.s = s;
635 }
636
637 SyntaxKind::Text
638 }
639
640 fn in_word(&self) -> bool {
641 let wordy = |c: Option<char>| {
642 c.is_some_and(|c| {
643 c.is_alphanumeric()
644 && !matches!(
645 c.script(),
646 Script::Han
647 | Script::Hiragana
648 | Script::Katakana
649 | Script::Hangul
650 )
651 })
652 };
653 let prev = self.s.scout(-2);
654 let next = self.s.peek();
655 wordy(prev) && wordy(next)
656 }
657
658 fn space_or_end(&self) -> bool {
659 self.s.done()
660 || self.s.at(char::is_whitespace)
661 || self.s.at("//")
662 || self.s.at("/*")
663 }
664}
665
/// Lexing in math mode.
impl Lexer<'_> {
    /// Lexes one math token whose first character `c` (starting at byte
    /// `start`) has already been eaten.
    ///
    /// Returns the token kind and, for identifiers and field accesses, a
    /// ready-made node. When the node is `None`, the caller is expected to
    /// build the node itself from the eaten text.
    fn math(&mut self, start: usize, c: char) -> (SyntaxKind, Option<SyntaxNode>) {
        let kind = match c {
            '\\' => self.backslash(),
            '"' => self.string(),

            // Multi-character shorthands. The guards consume input, so the
            // order matters: where one shorthand is a prefix of another, the
            // longer one is listed first.
            '-' if self.s.eat_if(">>") => SyntaxKind::MathShorthand,
            '-' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            '-' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            ':' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            ':' if self.s.eat_if(":=") => SyntaxKind::MathShorthand,
            '!' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            '.' if self.s.eat_if("..") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("==>") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("-->") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("--") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("-<") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("<-") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("<<") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("==") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if("~~") => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('<') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('-') => SyntaxKind::MathShorthand,
            '<' if self.s.eat_if('~') => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if(">>") => SyntaxKind::MathShorthand,
            '=' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
            '=' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            '=' if self.s.eat_if(':') => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if('=') => SyntaxKind::MathShorthand,
            '>' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if("->") => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if("=>") => SyntaxKind::MathShorthand,
            '|' if self.s.eat_if('|') => SyntaxKind::MathShorthand,
            '~' if self.s.eat_if("~>") => SyntaxKind::MathShorthand,
            '~' if self.s.eat_if('>') => SyntaxKind::MathShorthand,
            // Single-character shorthands.
            '*' | '-' | '~' => SyntaxKind::MathShorthand,

            '.' => SyntaxKind::Dot,
            ',' => SyntaxKind::Comma,
            ';' => SyntaxKind::Semicolon,

            '#' => SyntaxKind::Hash,
            '_' => SyntaxKind::Underscore,
            '$' => SyntaxKind::Dollar,
            '/' => SyntaxKind::Slash,
            '^' => SyntaxKind::Hat,
            '&' => SyntaxKind::MathAlignPoint,
            '√' | '∛' | '∜' => SyntaxKind::Root,
            '!' => SyntaxKind::Bang,

            // A run of primes becomes a single token.
            '\'' => {
                self.s.eat_while('\'');
                SyntaxKind::MathPrimes
            }

            '(' => SyntaxKind::LeftParen,
            ')' => SyntaxKind::RightParen,
            // `[|` and `|]` are lexed as an opening and a closing brace.
            '[' if self.s.eat_if('|') => SyntaxKind::LeftBrace,
            '|' if self.s.eat_if(']') => SyntaxKind::RightBrace,
            // Any other character whose default math class is opening or
            // closing is lexed as a brace as well.
            c if default_math_class(c) == Some(MathClass::Opening) => {
                SyntaxKind::LeftBrace
            }
            c if default_math_class(c) == Some(MathClass::Closing) => {
                SyntaxKind::RightBrace
            }

            // An identifier needs a start character followed by at least one
            // continue character.
            c if is_math_id_start(c) && self.s.at(is_math_id_continue) => {
                self.s.eat_while(is_math_id_continue);
                // Byte index at which the last grapheme cluster starts.
                let (last_index, _) =
                    self.s.from(start).grapheme_indices(true).next_back().unwrap();
                if last_index == 0 {
                    // Everything eaten is one grapheme cluster, so this is
                    // text rather than an identifier.
                    SyntaxKind::MathText
                } else {
                    let (kind, node) = self.math_ident_or_field(start);
                    return (kind, Some(node));
                }
            }

            _ => self.math_text(start, c),
        };
        (kind, None)
    }

    /// Builds the node for an already eaten math identifier starting at
    /// `start`, wrapping it in one `MathFieldAccess` node per `.ident` that
    /// follows.
    fn math_ident_or_field(&mut self, start: usize) -> (SyntaxKind, SyntaxNode) {
        let mut kind = SyntaxKind::MathIdent;
        let mut node = SyntaxNode::leaf(kind, self.s.from(start));
        while let Some(ident) = self.maybe_dot_ident() {
            kind = SyntaxKind::MathFieldAccess;
            // The node built so far becomes the first child of the access.
            let field_children = vec![
                node,
                SyntaxNode::leaf(SyntaxKind::Dot, '.'),
                SyntaxNode::leaf(SyntaxKind::MathIdent, ident),
            ];
            node = SyntaxNode::inner(kind, field_children);
        }
        (kind, node)
    }

    /// Eats a dot followed by a math identifier and returns the identifier
    /// text. Returns `None` without consuming anything otherwise.
    fn maybe_dot_ident(&mut self) -> Option<&str> {
        // The character after the next one is checked first so that the dot
        // is only eaten when an identifier start follows it.
        if self.s.scout(1).is_some_and(is_math_id_start) && self.s.eat_if('.') {
            let ident_start = self.s.cursor();
            self.s.eat();
            self.s.eat_while(is_math_id_continue);
            Some(self.s.from(ident_start))
        } else {
            None
        }
    }

    /// Lexes math text: either a number (digits with an optional fractional
    /// part) or a single grapheme cluster.
    fn math_text(&mut self, start: usize, c: char) -> SyntaxKind {
        if c.is_numeric() {
            self.s.eat_while(char::is_numeric);
            // Try the fractional part on a copy; it is only accepted when at
            // least one digit follows the dot.
            let mut s = self.s;
            if s.eat_if('.') && !s.eat_while(char::is_numeric).is_empty() {
                self.s = s;
            }
        } else {
            // Byte length of the first grapheme cluster at `start`; the
            // cursor is moved to its end so the cluster stays in one token.
            let len = self
                .s
                .get(start..self.s.string().len())
                .graphemes(true)
                .next()
                .map_or(0, str::len);
            self.s.jump(start + len);
        }
        SyntaxKind::MathText
    }

    /// Tries to lex a named-argument name (an identifier directly followed by
    /// a colon) beginning at `start`.
    ///
    /// On success, returns the identifier node and leaves the cursor right
    /// before the colon. A bare `_` yields an error node instead. On failure,
    /// the cursor is restored to where it was and `None` is returned.
    pub fn maybe_math_named_arg(&mut self, start: usize) -> Option<SyntaxNode> {
        let cursor = self.s.cursor();
        self.s.jump(start);
        if self.s.eat_if(is_id_start) {
            self.s.eat_while(is_id_continue);
            // A colon must follow directly, but not as the start of `:=` or
            // `::=`.
            if self.s.at(':') && !self.s.at(":=") && !self.s.at("::=") {
                let node = if self.s.from(start) != "_" {
                    SyntaxNode::leaf(SyntaxKind::Ident, self.s.from(start))
                } else {
                    let message = "expected identifier, found underscore";
                    SyntaxNode::error(message, self.s.from(start))
                };
                return Some(node);
            }
        }
        self.s.jump(cursor);
        None
    }

    /// Tries to lex the `..` of a spread argument beginning at `start`.
    ///
    /// On success, returns a `Dots` node and leaves the cursor after the
    /// dots. On failure, the cursor is restored and `None` is returned.
    pub fn maybe_math_spread_arg(&mut self, start: usize) -> Option<SyntaxNode> {
        let cursor = self.s.cursor();
        self.s.jump(start);
        if self.s.eat_if("..") {
            // The dots must be followed directly by something other than
            // whitespace, a comment, the end of the input, or one of
            // `.`, `,`, `;`, `)`, `$`.
            if !self.space_or_end() && !self.s.at(['.', ',', ';', ')', '$']) {
                let node = SyntaxNode::leaf(SyntaxKind::Dots, self.s.from(start));
                return Some(node);
            }
        }
        self.s.jump(cursor);
        None
    }
}
850
/// Lexing in code mode.
impl Lexer<'_> {
    /// Lexes one code token whose first character `c` (starting at byte
    /// `start`) has already been eaten.
    fn code(&mut self, start: usize, c: char) -> SyntaxKind {
        match c {
            '<' if self.s.at(is_id_continue) => self.label(),
            '0'..='9' => self.number(start, c),
            // A dot directly followed by a digit starts a number like `.5`.
            '.' if self.s.at(char::is_ascii_digit) => self.number(start, c),
            '"' => self.string(),

            // Two-character operators; their guards consume the second
            // character, so they come before the single-character arms.
            '=' if self.s.eat_if('=') => SyntaxKind::EqEq,
            '!' if self.s.eat_if('=') => SyntaxKind::ExclEq,
            '<' if self.s.eat_if('=') => SyntaxKind::LtEq,
            '>' if self.s.eat_if('=') => SyntaxKind::GtEq,
            '+' if self.s.eat_if('=') => SyntaxKind::PlusEq,
            '-' | '\u{2212}' if self.s.eat_if('=') => SyntaxKind::HyphEq,
            '*' if self.s.eat_if('=') => SyntaxKind::StarEq,
            '/' if self.s.eat_if('=') => SyntaxKind::SlashEq,
            '.' if self.s.eat_if('.') => SyntaxKind::Dots,
            '=' if self.s.eat_if('>') => SyntaxKind::Arrow,

            '{' => SyntaxKind::LeftBrace,
            '}' => SyntaxKind::RightBrace,
            '[' => SyntaxKind::LeftBracket,
            ']' => SyntaxKind::RightBracket,
            '(' => SyntaxKind::LeftParen,
            ')' => SyntaxKind::RightParen,
            '$' => SyntaxKind::Dollar,
            ',' => SyntaxKind::Comma,
            ';' => SyntaxKind::Semicolon,
            ':' => SyntaxKind::Colon,
            '.' => SyntaxKind::Dot,
            '+' => SyntaxKind::Plus,
            // U+2212 (minus sign) is accepted as an alternative to the hyphen.
            '-' | '\u{2212}' => SyntaxKind::Minus,
            '*' => SyntaxKind::Star,
            '/' => SyntaxKind::Slash,
            '=' => SyntaxKind::Eq,
            '<' => SyntaxKind::Lt,
            '>' => SyntaxKind::Gt,

            c if is_id_start(c) => self.ident(start),

            c => self.invalid_char_in_code(c),
        }
    }

    /// Records an error for a character that cannot start a code token,
    /// adding hints for some common mistakes. Always returns
    /// `SyntaxKind::Error`.
    fn invalid_char_in_code(&mut self, c: char) -> SyntaxKind {
        let invalid_char = || eco_format!("the character `{c}` is not valid in code");
        let invalid_str = |s: &str| eco_format!("`{s}` is not valid in code");
        // The kind returned by `self.error` is dropped in each arm because
        // the function returns `SyntaxKind::Error` at the end anyway.
        match c {
            // `scout(-2)` is the character before the one just eaten.
            _ if self.s.scout(-2) == Some('#') => {
                self.error(invalid_char());
                self.hint("the preceding hash is causing this to parse in code mode");
                self.hint("try escaping the preceding hash: `\\#`");
            }
            '#' => {
                self.error(invalid_char());
                self.hint("you are already in code mode");
                self.hint("try removing the `#`");
            }
            '&' if self.s.eat_if('&') => {
                self.error(invalid_str("&&"));
                self.hint("in Typst, `and` is used for logical AND");
            }
            '|' if self.s.eat_if('|') => {
                self.error(invalid_str("||"));
                self.hint("in Typst, `or` is used for logical OR");
            }
            '!' => {
                self.error(invalid_char());
                self.hint("in Typst, `not` is used for negation");
                self.hint("or did you mean to write `!=` for not-equal?");
            }
            '~' if self.s.eat_if('=') => {
                self.error(invalid_str("~="));
                self.hint("in Typst, `!=` is used for not-equal");
            }
            _ => {
                self.error(invalid_char());
            }
        }
        SyntaxKind::Error
    }

    /// Lexes an identifier that starts at `start` and classifies it as a
    /// keyword, an underscore, or a plain identifier.
    fn ident(&mut self, start: usize) -> SyntaxKind {
        self.s.eat_while(is_id_continue);
        let ident = self.s.from(start);

        // Keywords are not recognized directly after a `.` or `@`; after
        // `..` they are recognized as usual.
        let prev = self.s.get(0..start);
        if (!prev.ends_with(['.', '@']) || prev.ends_with(".."))
            && let Some(keyword) = keyword(ident)
        {
            return keyword;
        }

        if ident == "_" { SyntaxKind::Underscore } else { SyntaxKind::Ident }
    }

    /// Lexes a number literal that starts at `start`, whose first character
    /// `first_c` (a digit or a dot) has already been eaten.
    ///
    /// Yields `Int`, `Float` or `Numeric` (a number with a unit suffix), or
    /// records an error for an invalid number and/or suffix.
    fn number(&mut self, start: usize, first_c: char) -> SyntaxKind {
        // A `0b`, `0o` or `0x` prefix selects a non-decimal base.
        let base = match first_c {
            '0' if self.s.eat_if('b') => 2,
            '0' if self.s.eat_if('o') => 8,
            '0' if self.s.eat_if('x') => 16,
            _ => 10,
        };

        // Eat the leading digits. For base 16, all ASCII alphanumerics are
        // eaten; non-hex characters are rejected when the number is parsed
        // further down.
        if base == 16 {
            self.s.eat_while(char::is_ascii_alphanumeric);
        } else {
            self.s.eat_while(char::is_ascii_digit);
        }

        // Only decimal numbers may have a fractional part or an exponent.
        let mut is_float = false;
        if base == 10 {
            if first_c == '.' {
                // The digits after the leading dot were already eaten above.
                is_float = true;
            } else if !self.s.at("..")
                && !self.s.scout(1).is_some_and(is_id_start)
                && self.s.eat_if('.')
            {
                // The dot is only a decimal point if it is neither the start
                // of `..` nor followed by an identifier start.
                is_float = true;
                self.s.eat_while(char::is_ascii_digit);
            }

            // An `e`/`E` starts an exponent unless it begins `em`, which is
            // one of the accepted suffixes below.
            if !self.s.at("em") && self.s.eat_if(['e', 'E']) {
                is_float = true;
                self.s.eat_if(['+', '-']);
                self.s.eat_while(char::is_ascii_digit);
            }
        }

        let number = self.s.from(start);
        let suffix = self.s.eat_while(|c: char| c.is_ascii_alphanumeric() || c == '%');

        // A decimal integer literal that overflows `i64` but parses as `f64`
        // is treated as a float.
        if base == 10
            && !is_float
            && let Err(e) = i64::from_str_radix(number, base)
            && matches!(e.kind(), IntErrorKind::PosOverflow | IntErrorKind::NegOverflow)
            && number.parse::<f64>().is_ok()
        {
            is_float = true;
        }

        // `Ok(None)`: no suffix. `Ok(Some(()))`: an accepted suffix.
        let mut suffix_result = match suffix {
            "" => Ok(None),
            "pt" | "mm" | "cm" | "in" | "deg" | "rad" | "em" | "fr" | "%" => Ok(Some(())),
            _ => Err(eco_format!("invalid number suffix: `{suffix}`")),
        };

        let number_result = if is_float && number.parse::<f64>().is_err() {
            Err(eco_format!("invalid floating point number: `{number}`"))
        } else if base == 10 {
            Ok(())
        } else {
            let name = match base {
                2 => "binary",
                8 => "octal",
                16 => "hexadecimal",
                _ => unreachable!(),
            };
            // `[2..]` skips the two-character base prefix.
            match i64::from_str_radix(&number[2..], base) {
                Ok(_) if suffix.is_empty() => Ok(()),
                Ok(value) => {
                    // Non-decimal numbers may not carry a suffix. If the
                    // suffix itself was fine, suggest the decimal spelling.
                    if suffix_result.is_ok() {
                        suffix_result = Err(eco_format!(
                            "try using a decimal number: `{value}{suffix}`"
                        ));
                    }
                    Err(eco_format!("{name} numbers cannot have a suffix"))
                }
                // No digits after the prefix. "octal" takes the article "an".
                Err(e) if *e.kind() == IntErrorKind::Empty => Err(eco_format!(
                    "expected a{} {name} number",
                    if base == 8 { "n" } else { "" },
                )),
                Err(_) => Err(eco_format!("invalid {name} number: `{number}`")),
            }
        };

        match (number_result, suffix_result) {
            (Ok(()), Ok(None)) if is_float => SyntaxKind::Float,
            (Ok(()), Ok(None)) => SyntaxKind::Int,
            (Ok(()), Ok(Some(()))) => SyntaxKind::Numeric,
            // With both invalid, the number error is the main message and
            // the suffix error becomes a hint.
            (Err(number_err), Err(suffix_err)) => {
                let error = self.error(number_err);
                self.hint(suffix_err);
                error
            }
            (Ok(()), Err(msg)) | (Err(msg), Ok(_)) => self.error(msg),
        }
    }

    /// Lexes a string literal after its opening quote, up to and including
    /// the next unescaped `"`. Records an error if the string is unclosed.
    fn string(&mut self) -> SyntaxKind {
        // `escaped` is true when the previous character was a backslash that
        // was not itself escaped.
        let mut escaped = false;
        self.s.eat_until(|c| {
            let stop = c == '"' && !escaped;
            escaped = c == '\\' && !escaped;
            stop
        });

        if !self.s.eat_if('"') {
            return self.error("unclosed string");
        }

        SyntaxKind::Str
    }
}
1084
1085fn keyword(ident: &str) -> Option<SyntaxKind> {
1087 Some(match ident {
1088 "none" => SyntaxKind::None,
1089 "auto" => SyntaxKind::Auto,
1090 "true" => SyntaxKind::Bool,
1091 "false" => SyntaxKind::Bool,
1092 "not" => SyntaxKind::Not,
1093 "and" => SyntaxKind::And,
1094 "or" => SyntaxKind::Or,
1095 "let" => SyntaxKind::Let,
1096 "set" => SyntaxKind::Set,
1097 "show" => SyntaxKind::Show,
1098 "context" => SyntaxKind::Context,
1099 "if" => SyntaxKind::If,
1100 "else" => SyntaxKind::Else,
1101 "for" => SyntaxKind::For,
1102 "in" => SyntaxKind::In,
1103 "while" => SyntaxKind::While,
1104 "break" => SyntaxKind::Break,
1105 "continue" => SyntaxKind::Continue,
1106 "return" => SyntaxKind::Return,
1107 "import" => SyntaxKind::Import,
1108 "include" => SyntaxKind::Include,
1109 "as" => SyntaxKind::As,
1110 _ => return None,
1111 })
1112}
1113
/// Extra scanner operations used by the lexer.
trait ScannerExt {
    /// Moves the cursor forward by `by`, relative to its current position.
    fn advance(&mut self, by: usize);
    /// Tries to eat a newline and returns whether one was eaten.
    fn eat_newline(&mut self) -> bool;
}
1118
1119impl ScannerExt for Scanner<'_> {
1120 fn advance(&mut self, by: usize) {
1121 self.jump(self.cursor() + by);
1122 }
1123
1124 fn eat_newline(&mut self) -> bool {
1125 let ate = self.eat_if(is_newline);
1126 if ate && self.before().ends_with('\r') {
1127 self.eat_if('\n');
1128 }
1129 ate
1130 }
1131}
1132
1133#[inline]
1135fn is_space(character: char, mode: SyntaxMode) -> bool {
1136 match mode {
1137 SyntaxMode::Markup => matches!(character, ' ' | '\t') || is_newline(character),
1138 _ => character.is_whitespace(),
1139 }
1140}
1141
/// Returns whether `character` is treated as a newline.
///
/// These are line feed, vertical tab, form feed and carriage return
/// (U+000A through U+000D), next line (U+0085), and the line and paragraph
/// separators (U+2028 and U+2029).
#[inline]
pub fn is_newline(character: char) -> bool {
    match character {
        '\n'..='\r' => true,
        '\u{0085}' => true,
        '\u{2028}'..='\u{2029}' => true,
        _ => false,
    }
}
1153
1154pub fn link_prefix(text: &str) -> (&str, bool) {
1157 let mut s = unscanny::Scanner::new(text);
1158 let mut brackets = Vec::new();
1159
1160 #[rustfmt::skip]
1161 s.eat_while(|c: char| {
1162 match c {
1163 | '0' ..= '9'
1164 | 'a' ..= 'z'
1165 | 'A' ..= 'Z'
1166 | '!' | '#' | '$' | '%' | '&' | '*' | '+'
1167 | ',' | '-' | '.' | '/' | ':' | ';' | '='
1168 | '?' | '@' | '_' | '~' | '\'' => true,
1169 '[' => {
1170 brackets.push(b'[');
1171 true
1172 }
1173 '(' => {
1174 brackets.push(b'(');
1175 true
1176 }
1177 ']' => brackets.pop() == Some(b'['),
1178 ')' => brackets.pop() == Some(b'('),
1179 _ => false,
1180 }
1181 });
1182
1183 while matches!(s.scout(-1), Some('!' | ',' | '.' | ':' | ';' | '?' | '\'')) {
1185 s.uneat();
1186 }
1187
1188 (s.before(), brackets.is_empty())
1189}
1190
1191pub fn split_newlines(text: &str) -> Vec<&str> {
1193 let mut s = Scanner::new(text);
1194 let mut lines = Vec::new();
1195 let mut start = 0;
1196 let mut end = 0;
1197
1198 while let Some(c) = s.eat() {
1199 if is_newline(c) {
1200 if c == '\r' {
1201 s.eat_if('\n');
1202 }
1203
1204 lines.push(&text[start..end]);
1205 start = s.cursor();
1206 }
1207 end = s.cursor();
1208 }
1209
1210 lines.push(&text[start..]);
1211 lines
1212}
1213
1214fn count_newlines(text: &str) -> usize {
1216 let mut newlines = 0;
1217 let mut s = Scanner::new(text);
1218 while let Some(c) = s.eat() {
1219 if is_newline(c) {
1220 if c == '\r' {
1221 s.eat_if('\n');
1222 }
1223 newlines += 1;
1224 }
1225 }
1226 newlines
1227}
1228
1229#[inline]
1237pub fn is_ident(string: &str) -> bool {
1238 let mut chars = string.chars();
1239 chars
1240 .next()
1241 .is_some_and(|c| is_id_start(c) && chars.all(is_id_continue))
1242}
1243
1244#[inline]
1246pub fn is_id_start(c: char) -> bool {
1247 is_xid_start(c) || c == '_'
1248}
1249
1250#[inline]
1252pub fn is_id_continue(c: char) -> bool {
1253 is_xid_continue(c) || c == '_' || c == '-'
1254}
1255
/// Returns whether `c` can start an identifier in math: any XID-start
/// character. Unlike `is_id_start`, an underscore does not qualify.
#[inline]
fn is_math_id_start(c: char) -> bool {
    is_xid_start(c)
}
1261
1262#[inline]
1264fn is_math_id_continue(c: char) -> bool {
1265 is_xid_continue(c) && c != '_'
1266}
1267
1268#[inline]
1270fn is_valid_in_label_literal(c: char) -> bool {
1271 is_id_continue(c) || matches!(c, ':' | '.')
1272}
1273
1274pub fn is_valid_label_literal_id(id: &str) -> bool {
1276 !id.is_empty() && id.chars().all(is_valid_in_label_literal)
1277}