1use crate::parser::inlines::bookdown::try_parse_bookdown_equation_definition;
27use crate::syntax::SyntaxKind;
28use rowan::{GreenNode, GreenNodeBuilder};
29
/// A single problem found while parsing math content.
///
/// The byte range is expressed as offsets into the string handed to the
/// parser; `byte_end` is exclusive.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct MathDiagnostic {
    /// Stable machine-readable identifier (see [`diagnostic_codes`]).
    pub code: &'static str,
    /// Human-readable description of the problem.
    pub message: &'static str,
    /// Byte offset where the offending span starts.
    pub byte_start: usize,
    /// Byte offset one past the end of the offending span.
    pub byte_end: usize,
}
40
41#[derive(Debug, Clone)]
43pub struct MathParseReport {
44 pub green: GreenNode,
45 pub diagnostics: Vec<MathDiagnostic>,
46}
47
/// Stable identifiers used in [`MathDiagnostic::code`].
pub mod diagnostic_codes {
    /// A `{` group was opened but never closed.
    pub const UNCLOSED_GROUP: &str = "MATH_UNCLOSED_GROUP";

    /// A `}` appeared with no group open to close.
    pub const UNEXPECTED_CLOSE_BRACE: &str = "MATH_UNEXPECTED_CLOSE_BRACE";

    /// A `\begin` was never followed by a closing `\end`.
    pub const UNCLOSED_ENVIRONMENT: &str = "MATH_UNCLOSED_ENVIRONMENT";

    /// The name given to `\end` differs from the one given to `\begin`.
    pub const MISMATCHED_ENVIRONMENT: &str = "MATH_MISMATCHED_ENVIRONMENT";

    /// A `\end` appeared with no environment open.
    pub const UNEXPECTED_END: &str = "MATH_UNEXPECTED_END";
}
61
/// Switches that alter how math content is tokenized.
///
/// The default has every extension turned off.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct MathParseOptions {
    /// When `true`, a `(` is checked for a bookdown equation label and, if
    /// one is found, it is emitted as a single `MATH_EQUATION_LABEL` token.
    pub bookdown_equation_labels: bool,
}
72
73pub fn parse_math_content(content: &str, opts: MathParseOptions) -> GreenNode {
77 parse_math_report(content, opts).green
78}
79
80pub fn parse_math_report(content: &str, opts: MathParseOptions) -> MathParseReport {
82 let mut parser = MathParser {
83 input: content,
84 pos: 0,
85 builder: GreenNodeBuilder::new(),
86 diagnostics: Vec::new(),
87 opts,
88 };
89 parser.builder.start_node(SyntaxKind::MATH_CONTENT.into());
90 parser.parse_elements(Ctx::Top);
91 parser.builder.finish_node();
92 MathParseReport {
93 green: parser.builder.finish(),
94 diagnostics: parser.diagnostics,
95 }
96}
97
/// What encloses the elements currently being parsed; this decides which
/// token ends the current run of elements.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ctx {
    /// Top level of the math content; only end of input ends the run.
    Top,
    /// Inside a `{ … }` group; a `}` ends the run.
    Group,
    /// Inside a `\begin … \end` environment; a `\end` ends the run.
    Env,
}
108
109struct MathParser<'a> {
110 input: &'a str,
111 pos: usize,
112 builder: GreenNodeBuilder<'static>,
113 diagnostics: Vec<MathDiagnostic>,
114 opts: MathParseOptions,
115}
116
117impl MathParser<'_> {
118 fn rest(&self) -> &str {
119 &self.input[self.pos..]
120 }
121
122 fn peek_char(&self) -> Option<char> {
123 self.rest().chars().next()
124 }
125
126 fn diagnose(&mut self, code: &'static str, message: &'static str, start: usize, end: usize) {
127 self.diagnostics.push(MathDiagnostic {
128 code,
129 message,
130 byte_start: start,
131 byte_end: end,
132 });
133 }
134
135 fn bump_bytes(&mut self, len: usize, kind: SyntaxKind) {
137 let text = &self.input[self.pos..self.pos + len];
138 self.builder.token(kind.into(), text);
139 self.pos += len;
140 }
141
142 fn peek_control_word(&self) -> Option<&str> {
146 let after = self.rest().strip_prefix('\\')?;
147 let len: usize = after
148 .bytes()
149 .take_while(|b| b.is_ascii_alphabetic() || *b == b'@')
150 .count();
151 if len == 0 { None } else { Some(&after[..len]) }
152 }
153
154 fn parse_elements(&mut self, ctx: Ctx) {
155 while let Some(c) = self.peek_char() {
156 match c {
157 '}' if ctx == Ctx::Group => break,
158 '}' => {
161 self.diagnose(
162 diagnostic_codes::UNEXPECTED_CLOSE_BRACE,
163 "unmatched closing brace `}`",
164 self.pos,
165 self.pos + 1,
166 );
167 self.bump_bytes(1, SyntaxKind::MATH_GROUP_CLOSE);
168 }
169 '\\' => {
170 if self.rest().starts_with("\\\\") {
171 self.bump_bytes(2, SyntaxKind::MATH_LINE_BREAK);
172 } else if let Some(word) = self.peek_control_word() {
173 match word {
174 "begin" => self.parse_environment(),
175 "end" if ctx == Ctx::Env => break,
176 "end" => {
177 self.diagnose(
179 diagnostic_codes::UNEXPECTED_END,
180 "`\\end` without a matching `\\begin`",
181 self.pos,
182 self.pos + 1 + word.len(),
183 );
184 self.parse_control_word();
185 }
186 _ => self.parse_control_word(),
187 }
188 } else {
189 self.parse_control_symbol();
190 }
191 }
192 '{' => self.parse_group(),
193 '(' if self.opts.bookdown_equation_labels => match self.equation_label_len() {
197 Some(len) => self.bump_bytes(len, SyntaxKind::MATH_EQUATION_LABEL),
198 None => self.bump_bytes(1, SyntaxKind::MATH_TEXT),
201 },
202 '&' => self.bump_bytes(1, SyntaxKind::MATH_ALIGN),
203 '^' | '_' => self.bump_bytes(1, SyntaxKind::MATH_SCRIPT),
204 c if is_operator(c) => self.bump_bytes(1, SyntaxKind::MATH_OPERATOR),
209 '%' => self.parse_comment(),
210 ' ' | '\t' => self.parse_spaces(),
211 '\n' => self.bump_bytes(1, SyntaxKind::MATH_NEWLINE),
212 '\r' => {
213 let len = if self.rest().starts_with("\r\n") {
214 2
215 } else {
216 1
217 };
218 self.bump_bytes(len, SyntaxKind::MATH_NEWLINE);
219 }
220 _ => self.parse_text(),
221 }
222 }
223 }
224
225 fn parse_environment(&mut self) {
229 let begin_start = self.pos;
230 self.builder.start_node(SyntaxKind::MATH_ENVIRONMENT.into());
231 self.parse_control_word(); let begin_name = self.parse_environment_name();
233 self.parse_elements(Ctx::Env);
234 if self.peek_control_word() == Some("end") {
235 let end_start = self.pos;
236 self.parse_control_word(); let end_name = self.parse_environment_name();
238 if begin_name != end_name {
239 self.diagnose(
240 diagnostic_codes::MISMATCHED_ENVIRONMENT,
241 "`\\end` name does not match the open `\\begin`",
242 end_start,
243 self.pos,
244 );
245 }
246 } else {
247 self.diagnose(
248 diagnostic_codes::UNCLOSED_ENVIRONMENT,
249 "`\\begin` without a matching `\\end`",
250 begin_start,
251 self.pos,
252 );
253 }
254 self.builder.finish_node();
255 }
256
257 fn parse_environment_name(&mut self) -> String {
260 if self.peek_char() != Some('{') {
261 return String::new();
262 }
263 let open = self.pos;
264 self.parse_group();
265 self.input[open..self.pos]
267 .trim_start_matches('{')
268 .trim_end_matches('}')
269 .to_string()
270 }
271
272 fn parse_group(&mut self) {
273 let open = self.pos;
274 self.builder.start_node(SyntaxKind::MATH_GROUP.into());
275 self.bump_bytes(1, SyntaxKind::MATH_GROUP_OPEN); self.parse_elements(Ctx::Group);
277 if self.peek_char() == Some('}') {
278 self.bump_bytes(1, SyntaxKind::MATH_GROUP_CLOSE); } else {
280 self.diagnose(
281 diagnostic_codes::UNCLOSED_GROUP,
282 "unclosed `{` group",
283 open,
284 open + 1,
285 );
286 }
287 self.builder.finish_node();
288 }
289
290 fn parse_control_word(&mut self) {
292 let word_len = self.peek_control_word().map(str::len).unwrap_or(0);
293 self.bump_bytes(1 + word_len, SyntaxKind::MATH_COMMAND);
294 }
295
296 fn parse_control_symbol(&mut self) {
299 let after = &self.input[self.pos + 1..];
300 let len = 1 + after.chars().next().map(char::len_utf8).unwrap_or(0);
301 self.bump_bytes(len, SyntaxKind::MATH_COMMAND);
302 }
303
304 fn parse_comment(&mut self) {
306 let len = self
307 .rest()
308 .find(['\n', '\r'])
309 .unwrap_or_else(|| self.rest().len());
310 self.bump_bytes(len, SyntaxKind::MATH_COMMENT);
311 }
312
313 fn parse_spaces(&mut self) {
314 let len = self
315 .rest()
316 .bytes()
317 .take_while(|&b| b == b' ' || b == b'\t')
318 .count();
319 self.bump_bytes(len, SyntaxKind::MATH_SPACE);
320 }
321
322 fn parse_text(&mut self) {
326 let bookdown = self.opts.bookdown_equation_labels;
327 let len = self
328 .rest()
329 .find(|c: char| is_special(c) || (bookdown && c == '('))
330 .unwrap_or_else(|| self.rest().len());
331 debug_assert!(len > 0, "parse_text on a special char");
332 self.bump_bytes(len, SyntaxKind::MATH_TEXT);
333 }
334
335 fn equation_label_len(&self) -> Option<usize> {
339 try_parse_bookdown_equation_definition(self.rest()).map(|(len, _)| len)
340 }
341}
342
343fn is_special(c: char) -> bool {
345 is_operator(c)
346 || matches!(
347 c,
348 '\\' | '{' | '}' | '&' | '^' | '_' | '%' | ' ' | '\t' | '\n' | '\r'
349 )
350}
351
/// Whether `c` is one of the single-character operators `+ - * = < >`.
fn is_operator(c: char) -> bool {
    const OPERATORS: [char; 6] = ['+', '-', '*', '=', '<', '>'];
    OPERATORS.contains(&c)
}
358
#[cfg(test)]
mod tests {
    use super::*;
    use crate::syntax::SyntaxNode;

    /// Options with bookdown equation labels switched on.
    const BOOKDOWN: MathParseOptions = MathParseOptions {
        bookdown_equation_labels: true,
    };

    /// Parses `content` with `opts` and wraps the result in a red tree.
    fn node_with(content: &str, opts: MathParseOptions) -> SyntaxNode {
        let green = parse_math_content(content, opts);
        SyntaxNode::new_root(green)
    }

    /// Parses `content` with default options.
    fn node(content: &str) -> SyntaxNode {
        node_with(content, MathParseOptions::default())
    }

    /// Kinds of every token in the tree, in document order.
    fn label_kinds(content: &str, opts: MathParseOptions) -> Vec<SyntaxKind> {
        let mut kinds = Vec::new();
        for element in node_with(content, opts).descendants_with_tokens() {
            if let Some(token) = element.into_token() {
                kinds.push(token.kind());
            }
        }
        kinds
    }

    /// Token kinds under default options.
    fn token_kinds(content: &str) -> Vec<SyntaxKind> {
        label_kinds(content, MathParseOptions::default())
    }

    /// Diagnostic codes reported for `content` under default options.
    fn codes(content: &str) -> Vec<&'static str> {
        let report = parse_math_report(content, MathParseOptions::default());
        report.diagnostics.iter().map(|diag| diag.code).collect()
    }

    /// The tree's text must reproduce the input exactly.
    fn assert_lossless(content: &str) {
        let rendered = node(content).text().to_string();
        assert_eq!(rendered, content, "roundtrip: {content:?}");
    }

    #[test]
    fn root_is_math_content() {
        let root = node("x");
        assert_eq!(root.kind(), SyntaxKind::MATH_CONTENT);
    }

    #[test]
    fn plain_text_is_one_atom_run() {
        for content in ["abc", "f(x)/2.5"] {
            assert_eq!(token_kinds(content), vec![SyntaxKind::MATH_TEXT]);
            assert_lossless(content);
        }
    }

    #[test]
    fn operators_split_atom_runs() {
        let expected = vec![
            SyntaxKind::MATH_TEXT,
            SyntaxKind::MATH_OPERATOR,
            SyntaxKind::MATH_TEXT,
            SyntaxKind::MATH_OPERATOR,
            SyntaxKind::MATH_TEXT,
        ];
        assert_eq!(token_kinds("a+b=c"), expected);
        assert_lossless("a+b=c");
    }

    #[test]
    fn each_operator_char_is_its_own_token() {
        for op in ["+", "-", "*", "=", "<", ">"] {
            assert_eq!(
                token_kinds(op),
                vec![SyntaxKind::MATH_OPERATOR],
                "operator {op:?}"
            );
            assert_lossless(op);
        }

        // Multi-character relations are not merged.
        let relation = vec![
            SyntaxKind::MATH_TEXT,
            SyntaxKind::MATH_OPERATOR,
            SyntaxKind::MATH_OPERATOR,
            SyntaxKind::MATH_TEXT,
        ];
        assert_eq!(token_kinds("a<=b"), relation);

        let unary = vec![SyntaxKind::MATH_OPERATOR, SyntaxKind::MATH_TEXT];
        assert_eq!(token_kinds("-x"), unary);
        assert_lossless("-x");

        // An escaped operator is a control symbol, not an operator.
        assert_eq!(token_kinds(r"\<"), vec![SyntaxKind::MATH_COMMAND]);
        assert_lossless(r"\<");
    }

    #[test]
    fn operators_inside_groups_and_scripts_are_lossless() {
        let samples = [r"e^{-x}", r"10^{-3}", r"\frac{a+b}{c-d}", r"x_{i+1}"];
        for content in samples {
            assert_lossless(content);
        }
    }

    #[test]
    fn control_word_and_symbol() {
        let word_then_symbol = r"\alpha\,";
        assert_eq!(
            token_kinds(word_then_symbol),
            vec![SyntaxKind::MATH_COMMAND, SyntaxKind::MATH_COMMAND]
        );
        assert_lossless(word_then_symbol);

        let escapes = r"\&\%\{\}";
        assert_eq!(token_kinds(escapes), vec![SyntaxKind::MATH_COMMAND; 4]);
        assert_lossless(escapes);
    }

    #[test]
    fn brace_group_nests() {
        let group = node(r"x^{2}")
            .descendants()
            .find(|n| n.kind() == SyntaxKind::MATH_GROUP)
            .expect("group");
        let kinds: Vec<_> = group
            .children_with_tokens()
            .map(|child| child.kind())
            .collect();
        let expected = vec![
            SyntaxKind::MATH_GROUP_OPEN,
            SyntaxKind::MATH_TEXT,
            SyntaxKind::MATH_GROUP_CLOSE,
        ];
        assert_eq!(kinds, expected);
        assert_lossless(r"x^{2}");
    }

    #[test]
    fn line_break_alignment_and_scripts() {
        let aligned_row = r"x &= 1 \\";
        assert_eq!(
            token_kinds(aligned_row),
            vec![
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_SPACE,
                SyntaxKind::MATH_ALIGN,
                SyntaxKind::MATH_OPERATOR,
                SyntaxKind::MATH_SPACE,
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_SPACE,
                SyntaxKind::MATH_LINE_BREAK,
            ]
        );
        assert_lossless(aligned_row);

        assert_eq!(
            token_kinds("x^2_i"),
            vec![
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_SCRIPT,
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_SCRIPT,
                SyntaxKind::MATH_TEXT,
            ]
        );
    }

    #[test]
    fn environment_wraps_body() {
        let content = "\\begin{aligned}\nx &= 1\n\\end{aligned}";
        let env = node(content)
            .descendants()
            .find(|n| n.kind() == SyntaxKind::MATH_ENVIRONMENT)
            .expect("environment");
        assert_eq!(env.text().to_string(), content);

        // `\begin` and `\end` are direct children of the environment node.
        let commands = env
            .children_with_tokens()
            .filter(|child| child.kind() == SyntaxKind::MATH_COMMAND)
            .count();
        assert_eq!(commands, 2);

        assert_lossless(content);
        let found = codes(content);
        assert!(found.is_empty(), "well-formed env has no diagnostics");
    }

    #[test]
    fn nested_environments() {
        let content = r"\begin{a}\begin{b}x\end{b}\end{a}";
        let tree = node(content);
        let envs = tree
            .descendants()
            .filter(|n| n.kind() == SyntaxKind::MATH_ENVIRONMENT)
            .count();
        assert_eq!(envs, 2);
        assert_lossless(content);
        assert!(codes(content).is_empty());
    }

    #[test]
    fn comment_runs_to_end_of_line() {
        let content = "a % tail\nb";
        let expected = vec![
            SyntaxKind::MATH_TEXT,
            SyntaxKind::MATH_SPACE,
            SyntaxKind::MATH_COMMENT,
            SyntaxKind::MATH_NEWLINE,
            SyntaxKind::MATH_TEXT,
        ];
        assert_eq!(token_kinds(content), expected);
        assert_lossless(content);
    }

    #[test]
    fn crlf_and_unicode_are_lossless() {
        assert_lossless("x &= 1\r\ny &= 2\r\n");
        assert_lossless(r"\alpha + \beta \neq \gamma_{\text{αβγ}}");
    }

    #[test]
    fn empty_content() {
        let rendered = node("").text().to_string();
        assert_eq!(rendered, "");
        assert!(token_kinds("").is_empty());
    }

    #[test]
    fn trailing_backslash() {
        let content = "a\\";
        assert_eq!(
            token_kinds(content),
            vec![SyntaxKind::MATH_TEXT, SyntaxKind::MATH_COMMAND]
        );
        assert_lossless(content);
    }

    #[test]
    fn unclosed_group_is_lossless_and_diagnosed() {
        let content = "{a";
        assert_lossless(content);
        assert_eq!(codes(content), vec![diagnostic_codes::UNCLOSED_GROUP]);
    }

    #[test]
    fn stray_close_brace_is_lossless_and_diagnosed() {
        let content = "a}b";
        assert_lossless(content);
        assert_eq!(
            codes(content),
            vec![diagnostic_codes::UNEXPECTED_CLOSE_BRACE]
        );
    }

    #[test]
    fn unclosed_environment_is_diagnosed() {
        let content = r"\begin{aligned} x &= 1";
        assert_lossless(content);
        let found = codes(content);
        assert_eq!(found, vec![diagnostic_codes::UNCLOSED_ENVIRONMENT]);
    }

    #[test]
    fn mismatched_environment_is_diagnosed() {
        let content = r"\begin{aligned}x\end{matrix}";
        assert_lossless(content);
        let found = codes(content);
        assert_eq!(found, vec![diagnostic_codes::MISMATCHED_ENVIRONMENT]);
    }

    #[test]
    fn stray_end_is_diagnosed() {
        let content = r"x \end{aligned}";
        assert_lossless(content);
        let found = codes(content);
        assert_eq!(found, vec![diagnostic_codes::UNEXPECTED_END]);
    }

    #[test]
    fn well_formed_math_has_no_diagnostics() {
        let found = codes(r"\frac{1}{2} + x^{2}");
        assert!(found.is_empty());
    }

    #[test]
    fn equation_label_recognized_when_enabled() {
        let source = r"a (\#eq:foo)";
        let kinds = label_kinds(source, BOOKDOWN);
        assert!(kinds.contains(&SyntaxKind::MATH_EQUATION_LABEL));

        // The label token spans the parentheses as well.
        let label = node_with(source, BOOKDOWN)
            .descendants_with_tokens()
            .filter_map(|element| element.into_token())
            .find(|token| token.kind() == SyntaxKind::MATH_EQUATION_LABEL)
            .expect("label token");
        assert_eq!(label.text(), r"(\#eq:foo)");
    }

    #[test]
    fn equation_label_ignored_when_disabled() {
        let kinds = label_kinds(r"a (\#eq:foo)", MathParseOptions::default());
        let has_label = kinds.contains(&SyntaxKind::MATH_EQUATION_LABEL);
        assert!(!has_label);
    }

    #[test]
    fn plain_parens_unchanged_when_disabled() {
        let kinds = token_kinds("f(x)");
        assert_eq!(kinds, vec![SyntaxKind::MATH_TEXT]);
    }

    #[test]
    fn label_parsing_is_lossless() {
        let content = "\\begin{align}\n a (\\#eq:solveG)\n\\end{align}";
        let rendered = node_with(content, BOOKDOWN).text().to_string();
        assert_eq!(rendered, content);
    }
}