Skip to main content

panache_parser/parser/
math.rs

1//! In-tree TeX math content parser.
2//!
3//! Produces a lossless structural CST for the *content* between math
4//! delimiters (the delimiters themselves are owned by the host `INLINE_MATH` /
5//! `DISPLAY_MATH` nodes, see `parser/inlines/math.rs`). The returned subtree is
6//! rooted at [`SyntaxKind::MATH_CONTENT`] and is spliced directly into the host
7//! document tree, replacing the opaque content `TEXT` token.
8//!
9//! This is a *syntactic* parse, not a semantic one: TeX is a Turing-complete
10//! macro language, so we only capture structure that a formatter can safely act
11//! on — brace groups, `\begin`/`\end` environments, control sequences,
12//! alignment tabs (`&`), line breaks (`\\`), sub/superscript markers, comments,
13//! and whitespace. Everything else is an ordinary-atom run ([`MATH_TEXT`]).
14//!
15//! Two outputs, two channels — the same split YAML uses (see
16//! `parser/yaml/model.rs`) and that texlab uses for LaTeX:
17//!
18//! - the **CST is lossless and never fails** (`node.text() == content` for every
19//!   input; worst case is a single `MATH_TEXT` atom), and
20//! - **errors ride a side-channel** ([`MathParseReport::diagnostics`]) so the
21//!   linter (and by proxy the LSP) can surface unbalanced braces and mismatched
22//!   environments without the parser ever rejecting input.
23//!
24//! [`MATH_TEXT`]: SyntaxKind::MATH_TEXT
25
26use crate::parser::inlines::bookdown::try_parse_bookdown_equation_definition;
27use crate::syntax::SyntaxKind;
28use rowan::{GreenNode, GreenNodeBuilder};
29
/// A non-fatal problem found while parsing math content. Byte offsets are
/// relative to the math content string (the caller offsets them into host
/// document coordinates when surfacing through the linter/LSP).
///
/// A diagnostic only annotates the input: the parser records it and keeps
/// going, so the CST is produced regardless of how many of these exist.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MathDiagnostic {
    /// Stable, machine-readable identifier; the parser in this module always
    /// fills it with one of the constants from [`diagnostic_codes`].
    pub code: &'static str,
    /// Fixed human-readable description of the problem.
    pub message: &'static str,
    /// First byte of the offending span (inclusive).
    pub byte_start: usize,
    /// One past the last byte of the offending span (exclusive), so a
    /// single-byte brace is reported as `pos..pos + 1`.
    pub byte_end: usize,
}
40
/// The lossless CST plus any diagnostics gathered on the side-channel.
///
/// Returned by [`parse_math_report`]; [`parse_math_content`] is the
/// convenience wrapper that keeps only [`MathParseReport::green`].
#[derive(Debug, Clone)]
pub struct MathParseReport {
    /// Green tree rooted at [`SyntaxKind::MATH_CONTENT`].
    pub green: GreenNode,
    /// Problems found while parsing, in the order the parser recorded them.
    /// Empty for well-formed input.
    pub diagnostics: Vec<MathDiagnostic>,
}
47
/// Stable diagnostic codes for math content. Mirrors `yaml::diagnostic_codes`.
///
/// These strings end up in [`MathDiagnostic::code`](super::MathDiagnostic);
/// the per-constant docs below also note which span the parser attaches to
/// each code.
pub mod diagnostic_codes {
    /// A `{` was never closed before the end of the math content.
    /// Reported on the opening `{` itself.
    pub const UNCLOSED_GROUP: &str = "MATH_UNCLOSED_GROUP";
    /// A `}` appeared with no matching `{`. Raised for any `}` seen while
    /// the parser is not directly inside a brace group (top level or an
    /// environment body); reported on the `}` itself.
    pub const UNEXPECTED_CLOSE_BRACE: &str = "MATH_UNEXPECTED_CLOSE_BRACE";
    /// A `\begin{env}` was never closed by a matching `\end{env}`.
    /// Reported from the `\begin` up to where the environment body stopped.
    pub const UNCLOSED_ENVIRONMENT: &str = "MATH_UNCLOSED_ENVIRONMENT";
    /// A `\begin{a}` was closed by `\end{b}` with a different name.
    /// Reported on the closing `\end` plus its name group.
    pub const MISMATCHED_ENVIRONMENT: &str = "MATH_MISMATCHED_ENVIRONMENT";
    /// An `\end` appeared with no open `\begin`. Raised for an `\end` seen
    /// at top level or directly inside a brace group; reported on the
    /// `\end` command only (not its name group).
    pub const UNEXPECTED_END: &str = "MATH_UNEXPECTED_END";
}
61
/// Flavor-/extension-dependent parsing options for math content. Default is
/// all-off (pure TeX). The math grammar itself is flavor-agnostic; only
/// constructs layered on top of TeX by a Markdown flavor live here.
///
/// The struct is `Copy`, so it is passed by value into the parse entry
/// points.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct MathParseOptions {
    /// Recognize bookdown equation labels `(\#eq:label)` as a single
    /// [`SyntaxKind::MATH_EQUATION_LABEL`] token (gated on the
    /// `bookdown_equation_references` extension).
    ///
    /// When `false` (the default) a `(` is always an ordinary
    /// [`SyntaxKind::MATH_OPEN`] delimiter.
    pub bookdown_equation_labels: bool,
}
72
73/// Parse math content into a lossless `MATH_CONTENT` green node, discarding
74/// diagnostics. `content` is the raw text between (but excluding) the math
75/// delimiters.
76pub fn parse_math_content(content: &str, opts: MathParseOptions) -> GreenNode {
77    parse_math_report(content, opts).green
78}
79
80/// Parse math content into a lossless CST plus a side-channel of diagnostics.
81pub fn parse_math_report(content: &str, opts: MathParseOptions) -> MathParseReport {
82    let mut parser = MathParser {
83        input: content,
84        pos: 0,
85        builder: GreenNodeBuilder::new(),
86        diagnostics: Vec::new(),
87        opts,
88    };
89    parser.builder.start_node(SyntaxKind::MATH_CONTENT.into());
90    parser.parse_elements(Ctx::Top);
91    parser.builder.finish_node();
92    MathParseReport {
93        green: parser.builder.finish(),
94        diagnostics: parser.diagnostics,
95    }
96}
97
/// Parse context, controlling which delimiter ends the current element run.
///
/// The context only describes the *innermost* construct being parsed: a `}`
/// ends the run only in [`Ctx::Group`], and an `\end` ends it only in
/// [`Ctx::Env`]. In every other context those tokens are kept in the tree
/// and reported as stray on the diagnostics side-channel.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Ctx {
    /// Top level of the math content.
    Top,
    /// Inside a `{ ... }` brace group; stops at the matching `}`.
    Group,
    /// Inside a `\begin{env} ... \end{env}` body; stops at `\end`.
    Env,
}
108
/// Single-pass recursive-descent parser state for one math content string.
///
/// Created by [`parse_math_report`], which also opens and closes the root
/// `MATH_CONTENT` node around the parse.
struct MathParser<'a> {
    /// The complete math content being parsed.
    input: &'a str,
    /// Cursor: byte offset into `input` of the next unconsumed byte.
    pos: usize,
    /// Green-tree builder receiving every node and token.
    builder: GreenNodeBuilder<'static>,
    /// Side-channel of non-fatal problems, in the order they were recorded.
    diagnostics: Vec<MathDiagnostic>,
    /// Flavor/extension switches in effect for this parse.
    opts: MathParseOptions,
}
116
117impl MathParser<'_> {
118    fn rest(&self) -> &str {
119        &self.input[self.pos..]
120    }
121
122    fn peek_char(&self) -> Option<char> {
123        self.rest().chars().next()
124    }
125
126    fn diagnose(&mut self, code: &'static str, message: &'static str, start: usize, end: usize) {
127        self.diagnostics.push(MathDiagnostic {
128            code,
129            message,
130            byte_start: start,
131            byte_end: end,
132        });
133    }
134
135    /// Emit a token of `len` bytes (from the current position) with `kind`.
136    fn bump_bytes(&mut self, len: usize, kind: SyntaxKind) {
137        let text = &self.input[self.pos..self.pos + len];
138        self.builder.token(kind.into(), text);
139        self.pos += len;
140    }
141
142    /// If the cursor is at a control word (`\` followed by ASCII letters or
143    /// `@`, matching TeX/texlab's control-word class), return that word
144    /// (without the backslash) without consuming anything.
145    fn peek_control_word(&self) -> Option<&str> {
146        let after = self.rest().strip_prefix('\\')?;
147        let len: usize = after
148            .bytes()
149            .take_while(|b| b.is_ascii_alphabetic() || *b == b'@')
150            .count();
151        if len == 0 { None } else { Some(&after[..len]) }
152    }
153
154    fn parse_elements(&mut self, ctx: Ctx) {
155        while let Some(c) = self.peek_char() {
156            match c {
157                '}' if ctx == Ctx::Group => break,
158                // A `}` outside any group is an unmatched close: keep it as a
159                // faithful (stray) close token and flag it on the side-channel.
160                '}' => {
161                    self.diagnose(
162                        diagnostic_codes::UNEXPECTED_CLOSE_BRACE,
163                        "unmatched closing brace `}`",
164                        self.pos,
165                        self.pos + 1,
166                    );
167                    self.bump_bytes(1, SyntaxKind::MATH_GROUP_CLOSE);
168                }
169                '\\' => {
170                    if self.rest().starts_with("\\\\") {
171                        self.bump_bytes(2, SyntaxKind::MATH_LINE_BREAK);
172                    } else if let Some(word) = self.peek_control_word() {
173                        match word {
174                            "begin" => self.parse_environment(),
175                            "end" if ctx == Ctx::Env => break,
176                            "end" => {
177                                // Stray `\end` with no open `\begin` at this level.
178                                self.diagnose(
179                                    diagnostic_codes::UNEXPECTED_END,
180                                    "`\\end` without a matching `\\begin`",
181                                    self.pos,
182                                    self.pos + 1 + word.len(),
183                                );
184                                self.parse_control_word();
185                            }
186                            _ => self.parse_control_word(),
187                        }
188                    } else {
189                        self.parse_control_symbol();
190                    }
191                }
192                '{' => self.parse_group(),
193                // Bookdown equation label `(\#eq:label)`, only when enabled.
194                // A non-matching `(` falls through to an ordinary open delimiter.
195                '(' if self.opts.bookdown_equation_labels => match self.equation_label_len() {
196                    Some(len) => self.bump_bytes(len, SyntaxKind::MATH_EQUATION_LABEL),
197                    None => self.bump_bytes(1, SyntaxKind::MATH_OPEN),
198                },
199                // Delimiters and punctuation: their TeX mathcode class is fixed
200                // at the character level, so it is a CST fact (unlike operator
201                // class). The ambiguous `| . /` stay in MATH_TEXT.
202                '(' | '[' => self.bump_bytes(1, SyntaxKind::MATH_OPEN),
203                ')' | ']' => self.bump_bytes(1, SyntaxKind::MATH_CLOSE),
204                ',' | ';' => self.bump_bytes(1, SyntaxKind::MATH_PUNCT),
205                '&' => self.bump_bytes(1, SyntaxKind::MATH_ALIGN),
206                '^' | '_' => self.bump_bytes(1, SyntaxKind::MATH_SCRIPT),
207                // Operator atoms (`+ - * = < >`), one token per char. Class and
208                // precedence are *not* assigned here: TeX itself coerces a
209                // binary atom to ordinary by its neighbors (unary minus), so the
210                // class is a property of list position, owned by the formatter.
211                c if is_operator(c) => self.bump_bytes(1, SyntaxKind::MATH_OPERATOR),
212                '%' => self.parse_comment(),
213                ' ' | '\t' => self.parse_spaces(),
214                '\n' => self.bump_bytes(1, SyntaxKind::MATH_NEWLINE),
215                '\r' => {
216                    let len = if self.rest().starts_with("\r\n") {
217                        2
218                    } else {
219                        1
220                    };
221                    self.bump_bytes(len, SyntaxKind::MATH_NEWLINE);
222                }
223                _ => self.parse_text(),
224            }
225        }
226    }
227
228    /// `\begin{env} ... \end{env}`. Matching is done by recursion plus the
229    /// `Env` context; name mismatches and missing `\end` are reported on the
230    /// side-channel but never abort the parse.
231    fn parse_environment(&mut self) {
232        let begin_start = self.pos;
233        self.builder.start_node(SyntaxKind::MATH_ENVIRONMENT.into());
234        self.parse_control_word(); // \begin
235        let begin_name = self.parse_environment_name();
236        self.parse_elements(Ctx::Env);
237        if self.peek_control_word() == Some("end") {
238            let end_start = self.pos;
239            self.parse_control_word(); // \end
240            let end_name = self.parse_environment_name();
241            if begin_name != end_name {
242                self.diagnose(
243                    diagnostic_codes::MISMATCHED_ENVIRONMENT,
244                    "`\\end` name does not match the open `\\begin`",
245                    end_start,
246                    self.pos,
247                );
248            }
249        } else {
250            self.diagnose(
251                diagnostic_codes::UNCLOSED_ENVIRONMENT,
252                "`\\begin` without a matching `\\end`",
253                begin_start,
254                self.pos,
255            );
256        }
257        self.builder.finish_node();
258    }
259
260    /// Parse the `{name}` group following `\begin` / `\end` (if present) and
261    /// return the inner name text for matching. Empty when absent.
262    fn parse_environment_name(&mut self) -> String {
263        if self.peek_char() != Some('{') {
264            return String::new();
265        }
266        let open = self.pos;
267        self.parse_group();
268        // Inner text = the group span minus its braces.
269        self.input[open..self.pos]
270            .trim_start_matches('{')
271            .trim_end_matches('}')
272            .to_string()
273    }
274
275    fn parse_group(&mut self) {
276        let open = self.pos;
277        self.builder.start_node(SyntaxKind::MATH_GROUP.into());
278        self.bump_bytes(1, SyntaxKind::MATH_GROUP_OPEN); // {
279        self.parse_elements(Ctx::Group);
280        if self.peek_char() == Some('}') {
281            self.bump_bytes(1, SyntaxKind::MATH_GROUP_CLOSE); // }
282        } else {
283            self.diagnose(
284                diagnostic_codes::UNCLOSED_GROUP,
285                "unclosed `{` group",
286                open,
287                open + 1,
288            );
289        }
290        self.builder.finish_node();
291    }
292
293    /// `\` + a run of control-word characters (e.g. `\alpha`, `\frac`, `\begin`).
294    fn parse_control_word(&mut self) {
295        let word_len = self.peek_control_word().map(str::len).unwrap_or(0);
296        self.bump_bytes(1 + word_len, SyntaxKind::MATH_COMMAND);
297    }
298
299    /// `\` + exactly one following character (e.g. `\%`, `\{`, `\,`), or a
300    /// lone trailing backslash at EOF.
301    fn parse_control_symbol(&mut self) {
302        let after = &self.input[self.pos + 1..];
303        let len = 1 + after.chars().next().map(char::len_utf8).unwrap_or(0);
304        self.bump_bytes(len, SyntaxKind::MATH_COMMAND);
305    }
306
307    /// `%` to (but not including) the end of the line.
308    fn parse_comment(&mut self) {
309        let len = self
310            .rest()
311            .find(['\n', '\r'])
312            .unwrap_or_else(|| self.rest().len());
313        self.bump_bytes(len, SyntaxKind::MATH_COMMENT);
314    }
315
316    fn parse_spaces(&mut self) {
317        let len = self
318            .rest()
319            .bytes()
320            .take_while(|&b| b == b' ' || b == b'\t')
321            .count();
322        self.bump_bytes(len, SyntaxKind::MATH_SPACE);
323    }
324
325    /// A run of ordinary atoms, up to the next structural character. Delimiters
326    /// and punctuation (`( ) [ ] , ;`) bound the run too — they are now their
327    /// own tokens (including the `(` that the dispatcher's equation-label check
328    /// sees while the bookdown extension is on).
329    fn parse_text(&mut self) {
330        let len = self
331            .rest()
332            .find(|c: char| is_special(c))
333            .unwrap_or_else(|| self.rest().len());
334        debug_assert!(len > 0, "parse_text on a special char");
335        self.bump_bytes(len, SyntaxKind::MATH_TEXT);
336    }
337
338    /// If the cursor is at a bookdown equation label `(\#eq:label)`, return its
339    /// byte length. Reuses the shared bookdown definition parser so the
340    /// recognized span matches the rest of the codebase exactly.
341    fn equation_label_len(&self) -> Option<usize> {
342        try_parse_bookdown_equation_definition(self.rest()).map(|(len, _)| len)
343    }
344}
345
346/// Characters that terminate a [`SyntaxKind::MATH_TEXT`] run.
347fn is_special(c: char) -> bool {
348    is_operator(c)
349        || is_delimiter(c)
350        || matches!(
351            c,
352            '\\' | '{' | '}' | '&' | '^' | '_' | '%' | ' ' | '\t' | '\n' | '\r'
353        )
354}
355
/// Whether `c` is a delimiter or punctuation atom that gets its own
/// [`SyntaxKind::MATH_OPEN`]/[`SyntaxKind::MATH_CLOSE`]/[`SyntaxKind::MATH_PUNCT`]
/// token instead of staying inside ordinary text. The TeX mathcode class of
/// these characters is fixed per character, which makes it a CST fact; the
/// ambiguous `| . /` are intentionally left out and remain text.
fn is_delimiter(c: char) -> bool {
    const DELIMITERS: [char; 6] = ['(', ')', '[', ']', ',', ';'];
    DELIMITERS.contains(&c)
}
363
/// Whether `c` is an operator atom that gets its own
/// [`SyntaxKind::MATH_OPERATOR`] token instead of staying inside ordinary
/// text. Covers the TeX mathbin (`+ - *`) and mathrel (`= < >`) core; class,
/// precedence and spacing are assigned downstream by the formatter.
fn is_operator(c: char) -> bool {
    const OPERATORS: [char; 6] = ['+', '-', '*', '=', '<', '>'];
    OPERATORS.contains(&c)
}
370
#[cfg(test)]
mod tests {
    use super::*;
    use crate::syntax::SyntaxNode;

    /// Parse `content` with default (pure TeX) options into a red tree.
    fn node(content: &str) -> SyntaxNode {
        node_with(content, MathParseOptions::default())
    }

    /// Parse `content` with explicit options into a red tree.
    fn node_with(content: &str, opts: MathParseOptions) -> SyntaxNode {
        SyntaxNode::new_root(parse_math_content(content, opts))
    }

    /// Kinds of every token in document order, parsed with `opts`. Single
    /// traversal shared by the default and bookdown-enabled helpers.
    fn token_kinds_with(content: &str, opts: MathParseOptions) -> Vec<SyntaxKind> {
        node_with(content, opts)
            .descendants_with_tokens()
            .filter_map(|el| el.into_token())
            .map(|tok| tok.kind())
            .collect()
    }

    /// Kinds of every token in document order, parsed with default options.
    fn token_kinds(content: &str) -> Vec<SyntaxKind> {
        token_kinds_with(content, MathParseOptions::default())
    }

    /// Diagnostic codes reported for `content` under default options.
    fn codes(content: &str) -> Vec<&'static str> {
        parse_math_report(content, MathParseOptions::default())
            .diagnostics
            .into_iter()
            .map(|d| d.code)
            .collect()
    }

    /// Losslessness is the hard invariant for every input.
    fn assert_lossless(content: &str) {
        assert_eq!(
            node(content).text().to_string(),
            content,
            "roundtrip: {content:?}"
        );
    }

    #[test]
    fn root_is_math_content() {
        assert_eq!(node("x").kind(), SyntaxKind::MATH_CONTENT);
    }

    #[test]
    fn plain_text_is_one_atom_run() {
        // A run with no structural or operator chars stays a single atom.
        assert_eq!(token_kinds("abc"), vec![SyntaxKind::MATH_TEXT]);
        assert_lossless("abc");
        // `/` and `.` are ambiguous, so they stay ordinary atoms (not operators
        // and not delimiters); only the parens split out.
        assert_eq!(
            token_kinds("f(x)/2.5"),
            vec![
                SyntaxKind::MATH_TEXT,  // f
                SyntaxKind::MATH_OPEN,  // (
                SyntaxKind::MATH_TEXT,  // x
                SyntaxKind::MATH_CLOSE, // )
                SyntaxKind::MATH_TEXT,  // /2.5
            ]
        );
        assert_lossless("f(x)/2.5");
    }

    #[test]
    fn delimiters_and_punctuation_split_atom_runs() {
        // `( [` open, `) ]` close, `, ;` punctuation — one token per char, with
        // a fixed CST kind (their TeX mathcode class is character-level).
        assert_eq!(
            token_kinds("[a,b);"),
            vec![
                SyntaxKind::MATH_OPEN,  // [
                SyntaxKind::MATH_TEXT,  // a
                SyntaxKind::MATH_PUNCT, // ,
                SyntaxKind::MATH_TEXT,  // b
                SyntaxKind::MATH_CLOSE, // )
                SyntaxKind::MATH_PUNCT, // ;
            ]
        );
        assert_lossless("[a,b);");
        // The ambiguous `| . /` are NOT delimiters — they stay in MATH_TEXT.
        assert_eq!(token_kinds("a|b.c/d"), vec![SyntaxKind::MATH_TEXT]);
        assert_lossless("a|b.c/d");
        // An escaped delimiter stays a control symbol, never a delimiter token.
        assert_eq!(token_kinds(r"\(\)\[\]"), vec![SyntaxKind::MATH_COMMAND; 4]);
        assert_lossless(r"\(\)\[\]");
    }

    #[test]
    fn operators_split_atom_runs() {
        // `+ - * = < >` each break the surrounding text into their own
        // MATH_OPERATOR token. Class/precedence is deferred to the formatter.
        assert_eq!(
            token_kinds("a+b=c"),
            vec![
                SyntaxKind::MATH_TEXT,     // a
                SyntaxKind::MATH_OPERATOR, // +
                SyntaxKind::MATH_TEXT,     // b
                SyntaxKind::MATH_OPERATOR, // =
                SyntaxKind::MATH_TEXT,     // c
            ]
        );
        assert_lossless("a+b=c");
    }

    #[test]
    fn each_operator_char_is_its_own_token() {
        for op in ["+", "-", "*", "=", "<", ">"] {
            assert_eq!(
                token_kinds(op),
                vec![SyntaxKind::MATH_OPERATOR],
                "operator {op:?}"
            );
            assert_lossless(op);
        }
        // Adjacent operators do not coalesce — one token per char.
        assert_eq!(
            token_kinds("a<=b"),
            vec![
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_OPERATOR, // <
                SyntaxKind::MATH_OPERATOR, // =
                SyntaxKind::MATH_TEXT,
            ]
        );
        assert_lossless("a<=b");
        // Unary vs binary minus is NOT distinguished here — both are operators.
        assert_eq!(
            token_kinds("-x"),
            vec![SyntaxKind::MATH_OPERATOR, SyntaxKind::MATH_TEXT]
        );
        assert_lossless("-x");
        // An escaped special stays a control symbol, never an operator.
        assert_eq!(token_kinds(r"\<"), vec![SyntaxKind::MATH_COMMAND]);
        assert_lossless(r"\<");
    }

    #[test]
    fn operators_inside_groups_and_scripts_are_lossless() {
        for content in [r"e^{-x}", r"10^{-3}", r"\frac{a+b}{c-d}", r"x_{i+1}"] {
            assert_lossless(content);
        }
    }

    #[test]
    fn control_word_and_symbol() {
        assert_eq!(
            token_kinds(r"\alpha\,"),
            vec![SyntaxKind::MATH_COMMAND, SyntaxKind::MATH_COMMAND]
        );
        assert_lossless(r"\alpha\,");
        // Escaped specials are control symbols, not structural markers.
        assert_eq!(token_kinds(r"\&\%\{\}"), vec![SyntaxKind::MATH_COMMAND; 4]);
        assert_lossless(r"\&\%\{\}");
    }

    #[test]
    fn brace_group_nests() {
        let tree = node(r"x^{2}");
        let group = tree
            .descendants()
            .find(|n| n.kind() == SyntaxKind::MATH_GROUP)
            .expect("group");
        let kinds: Vec<_> = group.children_with_tokens().map(|el| el.kind()).collect();
        assert_eq!(
            kinds,
            vec![
                SyntaxKind::MATH_GROUP_OPEN,
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_GROUP_CLOSE
            ]
        );
        assert_lossless(r"x^{2}");
    }

    #[test]
    fn line_break_alignment_and_scripts() {
        assert_eq!(
            token_kinds(r"x &= 1 \\"),
            vec![
                SyntaxKind::MATH_TEXT,       // x
                SyntaxKind::MATH_SPACE,      // ' '
                SyntaxKind::MATH_ALIGN,      // &
                SyntaxKind::MATH_OPERATOR,   // =
                SyntaxKind::MATH_SPACE,      // ' '
                SyntaxKind::MATH_TEXT,       // 1
                SyntaxKind::MATH_SPACE,      // ' '
                SyntaxKind::MATH_LINE_BREAK, // \\
            ]
        );
        assert_lossless(r"x &= 1 \\");
        assert_eq!(
            token_kinds("x^2_i"),
            vec![
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_SCRIPT,
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_SCRIPT,
                SyntaxKind::MATH_TEXT,
            ]
        );
        assert_lossless("x^2_i");
    }

    #[test]
    fn environment_wraps_body() {
        let content = "\\begin{aligned}\nx &= 1\n\\end{aligned}";
        let tree = node(content);
        let env = tree
            .descendants()
            .find(|n| n.kind() == SyntaxKind::MATH_ENVIRONMENT)
            .expect("environment");
        assert_eq!(env.text().to_string(), content);
        // Exactly the `\begin` and `\end` commands are direct children.
        let commands = env
            .children_with_tokens()
            .filter(|el| el.kind() == SyntaxKind::MATH_COMMAND)
            .count();
        assert_eq!(commands, 2);
        assert_lossless(content);
        assert!(
            codes(content).is_empty(),
            "well-formed env has no diagnostics"
        );
    }

    #[test]
    fn nested_environments() {
        let content = r"\begin{a}\begin{b}x\end{b}\end{a}";
        let envs = node(content)
            .descendants()
            .filter(|n| n.kind() == SyntaxKind::MATH_ENVIRONMENT)
            .count();
        assert_eq!(envs, 2);
        assert_lossless(content);
        assert!(codes(content).is_empty());
    }

    #[test]
    fn comment_runs_to_end_of_line() {
        assert_eq!(
            token_kinds("a % tail\nb"),
            vec![
                SyntaxKind::MATH_TEXT,
                SyntaxKind::MATH_SPACE,
                SyntaxKind::MATH_COMMENT,
                SyntaxKind::MATH_NEWLINE,
                SyntaxKind::MATH_TEXT,
            ]
        );
        assert_lossless("a % tail\nb");
    }

    #[test]
    fn crlf_and_unicode_are_lossless() {
        assert_lossless("x &= 1\r\ny &= 2\r\n");
        assert_lossless(r"\alpha + \beta \neq \gamma_{\text{αβγ}}");
    }

    #[test]
    fn empty_content() {
        assert_eq!(node("").text().to_string(), "");
        assert!(token_kinds("").is_empty());
    }

    #[test]
    fn trailing_backslash() {
        assert_eq!(
            token_kinds("a\\"),
            vec![SyntaxKind::MATH_TEXT, SyntaxKind::MATH_COMMAND]
        );
        assert_lossless("a\\");
    }

    // --- Diagnostics side-channel (lossless even when malformed) ---

    #[test]
    fn unclosed_group_is_lossless_and_diagnosed() {
        assert_lossless("{a");
        assert_eq!(codes("{a"), vec![diagnostic_codes::UNCLOSED_GROUP]);
    }

    #[test]
    fn stray_close_brace_is_lossless_and_diagnosed() {
        assert_lossless("a}b");
        assert_eq!(codes("a}b"), vec![diagnostic_codes::UNEXPECTED_CLOSE_BRACE]);
    }

    #[test]
    fn unclosed_environment_is_diagnosed() {
        let content = r"\begin{aligned} x &= 1";
        assert_lossless(content);
        assert_eq!(codes(content), vec![diagnostic_codes::UNCLOSED_ENVIRONMENT]);
    }

    #[test]
    fn mismatched_environment_is_diagnosed() {
        let content = r"\begin{aligned}x\end{matrix}";
        assert_lossless(content);
        assert_eq!(
            codes(content),
            vec![diagnostic_codes::MISMATCHED_ENVIRONMENT]
        );
    }

    #[test]
    fn stray_end_is_diagnosed() {
        let content = r"x \end{aligned}";
        assert_lossless(content);
        assert_eq!(codes(content), vec![diagnostic_codes::UNEXPECTED_END]);
    }

    #[test]
    fn well_formed_math_has_no_diagnostics() {
        assert!(codes(r"\frac{1}{2} + x^{2}").is_empty());
    }

    // --- Bookdown equation labels (gated on the extension) ---

    const BOOKDOWN: MathParseOptions = MathParseOptions {
        bookdown_equation_labels: true,
    };

    #[test]
    fn equation_label_recognized_when_enabled() {
        let content = r"a (\#eq:foo)";
        let kinds = token_kinds_with(content, BOOKDOWN);
        assert!(kinds.contains(&SyntaxKind::MATH_EQUATION_LABEL));
        // The label is a single token spanning the whole `(\#eq:foo)`.
        let label = node_with(content, BOOKDOWN)
            .descendants_with_tokens()
            .filter_map(|el| el.into_token())
            .find(|t| t.kind() == SyntaxKind::MATH_EQUATION_LABEL)
            .expect("label token");
        assert_eq!(label.text(), r"(\#eq:foo)");
    }

    #[test]
    fn equation_label_ignored_when_disabled() {
        // Default options: no label token, and plain math is byte-identical.
        let content = r"a (\#eq:foo)";
        let kinds = token_kinds_with(content, MathParseOptions::default());
        assert!(!kinds.contains(&SyntaxKind::MATH_EQUATION_LABEL));
        assert_lossless(content);
    }

    #[test]
    fn plain_parens_tokenize_the_same_with_or_without_bookdown() {
        // A non-label `(` is an ordinary open delimiter in both modes; only a
        // genuine `(\#eq:...)` label is special, and only when the extension is
        // on. So `f(x)` tokenizes identically either way.
        let expected = vec![
            SyntaxKind::MATH_TEXT,  // f
            SyntaxKind::MATH_OPEN,  // (
            SyntaxKind::MATH_TEXT,  // x
            SyntaxKind::MATH_CLOSE, // )
        ];
        assert_eq!(token_kinds("f(x)"), expected);
        assert_eq!(token_kinds_with("f(x)", BOOKDOWN), expected);
    }

    #[test]
    fn label_parsing_is_lossless() {
        let content = "\\begin{align}\n  a (\\#eq:solveG)\n\\end{align}";
        assert_eq!(node_with(content, BOOKDOWN).text().to_string(), content);
    }
}