1use crate::parser::inlines::bookdown::try_parse_bookdown_equation_definition;
27use crate::syntax::SyntaxKind;
28use rowan::{GreenNode, GreenNodeBuilder};
29
/// One problem found while parsing math content.
///
/// Diagnostics never stop the parse: the offending text is still emitted into
/// the tree, and the issue is reported alongside it.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MathDiagnostic {
    /// Stable identifier for the kind of problem; the parser uses the
    /// constants in [`diagnostic_codes`].
    pub code: &'static str,
    /// Human-readable description of the problem.
    pub message: &'static str,
    /// Byte offset into the parsed content where the flagged span starts.
    pub byte_start: usize,
    /// Byte offset into the parsed content just past the end of the flagged span.
    pub byte_end: usize,
}
40
41#[derive(Debug, Clone)]
43pub struct MathParseReport {
44 pub green: GreenNode,
45 pub diagnostics: Vec<MathDiagnostic>,
46}
47
/// Identifiers stored in [`MathDiagnostic::code`](super::MathDiagnostic).
pub mod diagnostic_codes {
    /// A `{` group reached the end of its input without a closing `}`.
    pub const UNCLOSED_GROUP: &'static str = "MATH_UNCLOSED_GROUP";
    /// A `}` appeared where no brace group was open.
    pub const UNEXPECTED_CLOSE_BRACE: &'static str = "MATH_UNEXPECTED_CLOSE_BRACE";
    /// A `\begin` was never followed by an `\end`.
    pub const UNCLOSED_ENVIRONMENT: &'static str = "MATH_UNCLOSED_ENVIRONMENT";
    /// The name given to `\end` differs from the name of the open `\begin`.
    pub const MISMATCHED_ENVIRONMENT: &'static str = "MATH_MISMATCHED_ENVIRONMENT";
    /// An `\end` appeared where no environment body was being parsed.
    pub const UNEXPECTED_END: &'static str = "MATH_UNEXPECTED_END";
}
61
/// Switches that alter how math content is tokenized.
///
/// The derived `Default` leaves every switch off.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct MathParseOptions {
    /// When `true`, a `(` is first tried as a bookdown equation label and, on
    /// success, emitted as a single `MATH_EQUATION_LABEL` token. When `false`,
    /// `(` is always a plain `MATH_OPEN` token.
    pub bookdown_equation_labels: bool,
}
72
73pub fn parse_math_content(content: &str, opts: MathParseOptions) -> GreenNode {
77 parse_math_report(content, opts).green
78}
79
80pub fn parse_math_report(content: &str, opts: MathParseOptions) -> MathParseReport {
82 let mut parser = MathParser {
83 input: content,
84 pos: 0,
85 builder: GreenNodeBuilder::new(),
86 diagnostics: Vec::new(),
87 opts,
88 };
89 parser.builder.start_node(SyntaxKind::MATH_CONTENT.into());
90 parser.parse_elements(Ctx::Top);
91 parser.builder.finish_node();
92 MathParseReport {
93 green: parser.builder.finish(),
94 diagnostics: parser.diagnostics,
95 }
96}
97
/// What encloses the elements currently being parsed; decides which token
/// ends the current element run.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Ctx {
    /// Outermost level: runs until the input is exhausted.
    Top,
    /// Inside a `{ … }` group: stops in front of the closing `}`.
    Group,
    /// Inside an environment body: stops in front of an `\end`.
    Env,
}
108
109struct MathParser<'a> {
110 input: &'a str,
111 pos: usize,
112 builder: GreenNodeBuilder<'static>,
113 diagnostics: Vec<MathDiagnostic>,
114 opts: MathParseOptions,
115}
116
117impl MathParser<'_> {
118 fn rest(&self) -> &str {
119 &self.input[self.pos..]
120 }
121
122 fn peek_char(&self) -> Option<char> {
123 self.rest().chars().next()
124 }
125
126 fn diagnose(&mut self, code: &'static str, message: &'static str, start: usize, end: usize) {
127 self.diagnostics.push(MathDiagnostic {
128 code,
129 message,
130 byte_start: start,
131 byte_end: end,
132 });
133 }
134
135 fn bump_bytes(&mut self, len: usize, kind: SyntaxKind) {
137 let text = &self.input[self.pos..self.pos + len];
138 self.builder.token(kind.into(), text);
139 self.pos += len;
140 }
141
142 fn peek_control_word(&self) -> Option<&str> {
146 let after = self.rest().strip_prefix('\\')?;
147 let len: usize = after
148 .bytes()
149 .take_while(|b| b.is_ascii_alphabetic() || *b == b'@')
150 .count();
151 if len == 0 { None } else { Some(&after[..len]) }
152 }
153
154 fn parse_elements(&mut self, ctx: Ctx) {
155 while let Some(c) = self.peek_char() {
156 match c {
157 '}' if ctx == Ctx::Group => break,
158 '}' => {
161 self.diagnose(
162 diagnostic_codes::UNEXPECTED_CLOSE_BRACE,
163 "unmatched closing brace `}`",
164 self.pos,
165 self.pos + 1,
166 );
167 self.bump_bytes(1, SyntaxKind::MATH_GROUP_CLOSE);
168 }
169 '\\' => {
170 if self.rest().starts_with("\\\\") {
171 self.bump_bytes(2, SyntaxKind::MATH_LINE_BREAK);
172 } else if let Some(word) = self.peek_control_word() {
173 match word {
174 "begin" => self.parse_environment(),
175 "end" if ctx == Ctx::Env => break,
176 "end" => {
177 self.diagnose(
179 diagnostic_codes::UNEXPECTED_END,
180 "`\\end` without a matching `\\begin`",
181 self.pos,
182 self.pos + 1 + word.len(),
183 );
184 self.parse_control_word();
185 }
186 _ => self.parse_control_word(),
187 }
188 } else {
189 self.parse_control_symbol();
190 }
191 }
192 '{' => self.parse_group(),
193 '(' if self.opts.bookdown_equation_labels => match self.equation_label_len() {
196 Some(len) => self.bump_bytes(len, SyntaxKind::MATH_EQUATION_LABEL),
197 None => self.bump_bytes(1, SyntaxKind::MATH_OPEN),
198 },
199 '(' | '[' => self.bump_bytes(1, SyntaxKind::MATH_OPEN),
203 ')' | ']' => self.bump_bytes(1, SyntaxKind::MATH_CLOSE),
204 ',' | ';' => self.bump_bytes(1, SyntaxKind::MATH_PUNCT),
205 '&' => self.bump_bytes(1, SyntaxKind::MATH_ALIGN),
206 '^' | '_' => self.bump_bytes(1, SyntaxKind::MATH_SCRIPT),
207 c if is_operator(c) => self.bump_bytes(1, SyntaxKind::MATH_OPERATOR),
212 '%' => self.parse_comment(),
213 ' ' | '\t' => self.parse_spaces(),
214 '\n' => self.bump_bytes(1, SyntaxKind::MATH_NEWLINE),
215 '\r' => {
216 let len = if self.rest().starts_with("\r\n") {
217 2
218 } else {
219 1
220 };
221 self.bump_bytes(len, SyntaxKind::MATH_NEWLINE);
222 }
223 _ => self.parse_text(),
224 }
225 }
226 }
227
228 fn parse_environment(&mut self) {
232 let begin_start = self.pos;
233 self.builder.start_node(SyntaxKind::MATH_ENVIRONMENT.into());
234 self.parse_control_word(); let begin_name = self.parse_environment_name();
236 self.parse_elements(Ctx::Env);
237 if self.peek_control_word() == Some("end") {
238 let end_start = self.pos;
239 self.parse_control_word(); let end_name = self.parse_environment_name();
241 if begin_name != end_name {
242 self.diagnose(
243 diagnostic_codes::MISMATCHED_ENVIRONMENT,
244 "`\\end` name does not match the open `\\begin`",
245 end_start,
246 self.pos,
247 );
248 }
249 } else {
250 self.diagnose(
251 diagnostic_codes::UNCLOSED_ENVIRONMENT,
252 "`\\begin` without a matching `\\end`",
253 begin_start,
254 self.pos,
255 );
256 }
257 self.builder.finish_node();
258 }
259
260 fn parse_environment_name(&mut self) -> String {
263 if self.peek_char() != Some('{') {
264 return String::new();
265 }
266 let open = self.pos;
267 self.parse_group();
268 self.input[open..self.pos]
270 .trim_start_matches('{')
271 .trim_end_matches('}')
272 .to_string()
273 }
274
275 fn parse_group(&mut self) {
276 let open = self.pos;
277 self.builder.start_node(SyntaxKind::MATH_GROUP.into());
278 self.bump_bytes(1, SyntaxKind::MATH_GROUP_OPEN); self.parse_elements(Ctx::Group);
280 if self.peek_char() == Some('}') {
281 self.bump_bytes(1, SyntaxKind::MATH_GROUP_CLOSE); } else {
283 self.diagnose(
284 diagnostic_codes::UNCLOSED_GROUP,
285 "unclosed `{` group",
286 open,
287 open + 1,
288 );
289 }
290 self.builder.finish_node();
291 }
292
293 fn parse_control_word(&mut self) {
295 let word_len = self.peek_control_word().map(str::len).unwrap_or(0);
296 self.bump_bytes(1 + word_len, SyntaxKind::MATH_COMMAND);
297 }
298
299 fn parse_control_symbol(&mut self) {
302 let after = &self.input[self.pos + 1..];
303 let len = 1 + after.chars().next().map(char::len_utf8).unwrap_or(0);
304 self.bump_bytes(len, SyntaxKind::MATH_COMMAND);
305 }
306
307 fn parse_comment(&mut self) {
309 let len = self
310 .rest()
311 .find(['\n', '\r'])
312 .unwrap_or_else(|| self.rest().len());
313 self.bump_bytes(len, SyntaxKind::MATH_COMMENT);
314 }
315
316 fn parse_spaces(&mut self) {
317 let len = self
318 .rest()
319 .bytes()
320 .take_while(|&b| b == b' ' || b == b'\t')
321 .count();
322 self.bump_bytes(len, SyntaxKind::MATH_SPACE);
323 }
324
325 fn parse_text(&mut self) {
330 let len = self
331 .rest()
332 .find(|c: char| is_special(c))
333 .unwrap_or_else(|| self.rest().len());
334 debug_assert!(len > 0, "parse_text on a special char");
335 self.bump_bytes(len, SyntaxKind::MATH_TEXT);
336 }
337
338 fn equation_label_len(&self) -> Option<usize> {
342 try_parse_bookdown_equation_definition(self.rest()).map(|(len, _)| len)
343 }
344}
345
346fn is_special(c: char) -> bool {
348 is_operator(c)
349 || is_delimiter(c)
350 || matches!(
351 c,
352 '\\' | '{' | '}' | '&' | '^' | '_' | '%' | ' ' | '\t' | '\n' | '\r'
353 )
354}
355
/// Returns `true` for brackets, parentheses, and the punctuation marks `,`
/// and `;`.
fn is_delimiter(c: char) -> bool {
    ['(', ')', '[', ']', ',', ';'].contains(&c)
}
363
/// Returns `true` for the single-character operators `+ - * = < >`.
fn is_operator(c: char) -> bool {
    "+-*=<>".contains(c)
}
370
#[cfg(test)]
mod tests {
    use super::*;
    use crate::syntax::SyntaxNode;

    /// Options with bookdown equation labels switched on.
    const BOOKDOWN: MathParseOptions = MathParseOptions {
        bookdown_equation_labels: true,
    };

    /// Parses `content` with explicit options and wraps the result in a red tree.
    fn node_with(content: &str, opts: MathParseOptions) -> SyntaxNode {
        let green = parse_math_content(content, opts);
        SyntaxNode::new_root(green)
    }

    /// Parses `content` with default options.
    fn node(content: &str) -> SyntaxNode {
        node_with(content, MathParseOptions::default())
    }

    /// Kinds of all tokens, in document order, when parsing with `opts`.
    fn label_kinds(content: &str, opts: MathParseOptions) -> Vec<SyntaxKind> {
        node_with(content, opts)
            .descendants_with_tokens()
            .filter_map(|el| el.into_token().map(|tok| tok.kind()))
            .collect()
    }

    /// Kinds of all tokens, in document order, when parsing with default options.
    fn token_kinds(content: &str) -> Vec<SyntaxKind> {
        label_kinds(content, MathParseOptions::default())
    }

    /// Diagnostic codes reported for `content` under default options.
    fn codes(content: &str) -> Vec<&'static str> {
        let report = parse_math_report(content, MathParseOptions::default());
        report.diagnostics.into_iter().map(|d| d.code).collect()
    }

    /// Asserts that the tree's text reproduces `content` exactly.
    fn assert_lossless(content: &str) {
        let roundtrip = node(content).text().to_string();
        assert_eq!(roundtrip, content, "roundtrip: {content:?}");
    }

    #[test]
    fn root_is_math_content() {
        let root = node("x");
        assert_eq!(root.kind(), SyntaxKind::MATH_CONTENT);
    }

    #[test]
    fn plain_text_is_one_atom_run() {
        let cases = [
            ("abc", vec![SyntaxKind::MATH_TEXT]),
            (
                "f(x)/2.5",
                vec![
                    SyntaxKind::MATH_TEXT,
                    SyntaxKind::MATH_OPEN,
                    SyntaxKind::MATH_TEXT,
                    SyntaxKind::MATH_CLOSE,
                    SyntaxKind::MATH_TEXT,
                ],
            ),
        ];
        for (content, expected) in cases {
            assert_eq!(token_kinds(content), expected);
            assert_lossless(content);
        }
    }

    #[test]
    fn delimiters_and_punctuation_split_atom_runs() {
        let cases = [
            (
                "[a,b);",
                vec![
                    SyntaxKind::MATH_OPEN,
                    SyntaxKind::MATH_TEXT,
                    SyntaxKind::MATH_PUNCT,
                    SyntaxKind::MATH_TEXT,
                    SyntaxKind::MATH_CLOSE,
                    SyntaxKind::MATH_PUNCT,
                ],
            ),
            // `|`, `.` and `/` are not special, so they stay inside the text run.
            ("a|b.c/d", vec![SyntaxKind::MATH_TEXT]),
            // Escaped delimiters are control symbols, not delimiters.
            (r"\(\)\[\]", vec![SyntaxKind::MATH_COMMAND; 4]),
        ];
        for (content, expected) in cases {
            assert_eq!(token_kinds(content), expected);
            assert_lossless(content);
        }
    }

    #[test]
    fn operators_split_atom_runs() {
        let content = "a+b=c";
        let expected = vec![
            SyntaxKind::MATH_TEXT,
            SyntaxKind::MATH_OPERATOR,
            SyntaxKind::MATH_TEXT,
            SyntaxKind::MATH_OPERATOR,
            SyntaxKind::MATH_TEXT,
        ];
        assert_eq!(token_kinds(content), expected);
        assert_lossless(content);
    }

    #[test]
    fn each_operator_char_is_its_own_token() {
        for op in ["+", "-", "*", "=", "<", ">"] {
            assert_eq!(
                token_kinds(op),
                vec![SyntaxKind::MATH_OPERATOR],
                "operator {op:?}"
            );
            assert_lossless(op);
        }

        // Adjacent operator characters are never merged.
        let two_char = vec![
            SyntaxKind::MATH_TEXT,
            SyntaxKind::MATH_OPERATOR,
            SyntaxKind::MATH_OPERATOR,
            SyntaxKind::MATH_TEXT,
        ];
        assert_eq!(token_kinds("a<=b"), two_char);

        let leading_minus = vec![SyntaxKind::MATH_OPERATOR, SyntaxKind::MATH_TEXT];
        assert_eq!(token_kinds("-x"), leading_minus);
        assert_lossless("-x");

        // An escaped operator is a control symbol.
        assert_eq!(token_kinds(r"\<"), vec![SyntaxKind::MATH_COMMAND]);
        assert_lossless(r"\<");
    }

    #[test]
    fn operators_inside_groups_and_scripts_are_lossless() {
        let samples = [r"e^{-x}", r"10^{-3}", r"\frac{a+b}{c-d}", r"x_{i+1}"];
        for content in samples {
            assert_lossless(content);
        }
    }

    #[test]
    fn control_word_and_symbol() {
        let word_then_symbol = r"\alpha\,";
        assert_eq!(
            token_kinds(word_then_symbol),
            vec![SyntaxKind::MATH_COMMAND, SyntaxKind::MATH_COMMAND]
        );
        assert_lossless(word_then_symbol);

        let escaped_specials = r"\&\%\{\}";
        assert_eq!(
            token_kinds(escaped_specials),
            vec![SyntaxKind::MATH_COMMAND; 4]
        );
        assert_lossless(escaped_specials);
    }

    #[test]
    fn brace_group_nests() {
        let content = r"x^{2}";
        let group = node(content)
            .descendants()
            .find(|n| n.kind() == SyntaxKind::MATH_GROUP)
            .expect("group");
        let kinds: Vec<_> = group.children_with_tokens().map(|el| el.kind()).collect();
        let expected = vec![
            SyntaxKind::MATH_GROUP_OPEN,
            SyntaxKind::MATH_TEXT,
            SyntaxKind::MATH_GROUP_CLOSE,
        ];
        assert_eq!(kinds, expected);
        assert_lossless(content);
    }

    #[test]
    fn line_break_alignment_and_scripts() {
        let aligned_row = r"x &= 1 \\";
        assert_eq!(
            token_kinds(aligned_row),
            vec![
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_SPACE,
                SyntaxKind::MATH_ALIGN,
                SyntaxKind::MATH_OPERATOR,
                SyntaxKind::MATH_SPACE,
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_SPACE,
                SyntaxKind::MATH_LINE_BREAK,
            ]
        );
        assert_lossless(aligned_row);

        assert_eq!(
            token_kinds("x^2_i"),
            vec![
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_SCRIPT,
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_SCRIPT,
                SyntaxKind::MATH_TEXT,
            ]
        );
    }

    #[test]
    fn environment_wraps_body() {
        let content = "\\begin{aligned}\nx &= 1\n\\end{aligned}";
        let env = node(content)
            .descendants()
            .find(|n| n.kind() == SyntaxKind::MATH_ENVIRONMENT)
            .expect("environment");
        assert_eq!(env.text().to_string(), content);

        // `\begin` and `\end` are the only direct command children.
        let commands = env
            .children_with_tokens()
            .filter(|el| el.kind() == SyntaxKind::MATH_COMMAND)
            .count();
        assert_eq!(commands, 2);

        assert_lossless(content);
        assert!(
            codes(content).is_empty(),
            "well-formed env has no diagnostics"
        );
    }

    #[test]
    fn nested_environments() {
        let content = r"\begin{a}\begin{b}x\end{b}\end{a}";
        let env_count = node(content)
            .descendants()
            .filter(|n| n.kind() == SyntaxKind::MATH_ENVIRONMENT)
            .count();
        assert_eq!(env_count, 2);
        assert_lossless(content);
        assert!(codes(content).is_empty());
    }

    #[test]
    fn comment_runs_to_end_of_line() {
        let content = "a % tail\nb";
        let expected = vec![
            SyntaxKind::MATH_TEXT,
            SyntaxKind::MATH_SPACE,
            SyntaxKind::MATH_COMMENT,
            SyntaxKind::MATH_NEWLINE,
            SyntaxKind::MATH_TEXT,
        ];
        assert_eq!(token_kinds(content), expected);
        assert_lossless(content);
    }

    #[test]
    fn crlf_and_unicode_are_lossless() {
        assert_lossless("x &= 1\r\ny &= 2\r\n");
        assert_lossless(r"\alpha + \beta \neq \gamma_{\text{αβγ}}");
    }

    #[test]
    fn empty_content() {
        assert_eq!(node("").text().to_string(), "");
        assert!(token_kinds("").is_empty());
    }

    #[test]
    fn trailing_backslash() {
        let content = "a\\";
        assert_eq!(
            token_kinds(content),
            vec![SyntaxKind::MATH_TEXT, SyntaxKind::MATH_COMMAND]
        );
        assert_lossless(content);
    }

    #[test]
    fn unclosed_group_is_lossless_and_diagnosed() {
        let content = "{a";
        assert_lossless(content);
        assert_eq!(codes(content), vec![diagnostic_codes::UNCLOSED_GROUP]);
    }

    #[test]
    fn stray_close_brace_is_lossless_and_diagnosed() {
        let content = "a}b";
        assert_lossless(content);
        assert_eq!(
            codes(content),
            vec![diagnostic_codes::UNEXPECTED_CLOSE_BRACE]
        );
    }

    #[test]
    fn unclosed_environment_is_diagnosed() {
        let content = r"\begin{aligned} x &= 1";
        assert_lossless(content);
        assert_eq!(codes(content), vec![diagnostic_codes::UNCLOSED_ENVIRONMENT]);
    }

    #[test]
    fn mismatched_environment_is_diagnosed() {
        let content = r"\begin{aligned}x\end{matrix}";
        assert_lossless(content);
        assert_eq!(
            codes(content),
            vec![diagnostic_codes::MISMATCHED_ENVIRONMENT]
        );
    }

    #[test]
    fn stray_end_is_diagnosed() {
        let content = r"x \end{aligned}";
        assert_lossless(content);
        assert_eq!(codes(content), vec![diagnostic_codes::UNEXPECTED_END]);
    }

    #[test]
    fn well_formed_math_has_no_diagnostics() {
        assert!(codes(r"\frac{1}{2} + x^{2}").is_empty());
    }

    #[test]
    fn equation_label_recognized_when_enabled() {
        let content = r"a (\#eq:foo)";
        let kinds = label_kinds(content, BOOKDOWN);
        assert!(kinds.contains(&SyntaxKind::MATH_EQUATION_LABEL));

        // The label token spans the whole parenthesised definition.
        let label = node_with(content, BOOKDOWN)
            .descendants_with_tokens()
            .filter_map(|el| el.into_token())
            .find(|t| t.kind() == SyntaxKind::MATH_EQUATION_LABEL)
            .expect("label token");
        assert_eq!(label.text(), r"(\#eq:foo)");
    }

    #[test]
    fn equation_label_ignored_when_disabled() {
        let kinds = label_kinds(r"a (\#eq:foo)", MathParseOptions::default());
        assert!(!kinds.contains(&SyntaxKind::MATH_EQUATION_LABEL));
    }

    #[test]
    fn plain_parens_tokenize_the_same_with_or_without_bookdown() {
        let expected = vec![
            SyntaxKind::MATH_TEXT,
            SyntaxKind::MATH_OPEN,
            SyntaxKind::MATH_TEXT,
            SyntaxKind::MATH_CLOSE,
        ];
        assert_eq!(token_kinds("f(x)"), expected);
        assert_eq!(label_kinds("f(x)", BOOKDOWN), expected);
    }

    #[test]
    fn label_parsing_is_lossless() {
        let content = "\\begin{align}\n a (\\#eq:solveG)\n\\end{align}";
        let roundtrip = node_with(content, BOOKDOWN).text().to_string();
        assert_eq!(roundtrip, content);
    }
}