Skip to main content

mdwright_math/
scan.rs

1//! Structural math recogniser.
2//!
3//! Walks `source` left-to-right with exclusion zones derived from the
4//! IR's inline / block atoms (code spans, code blocks, HTML blocks,
5//! inline HTML). Inside an exclusion the scanner skips ahead to the
6//! zone's end, so `$` inside `` `cost is $5` `` or `<a title="$x$">`
7//! cannot anchor a math region.
8//!
//! Two opener families are recognised:
10//!
11//! - Delimited pairs: `\[ … \]`, `\( … \)`, `$$ … $$`, `$ … $`.
12//!   Greedy first-close matches the heuristic scanner's behaviour and
13//!   the way `KaTeX` / pandoc resolve these in practice.
14//! - LaTeX environments: `\begin{name} … \end{name}`. The recogniser
15//!   counts nested `\begin{name}` so an inner environment of the same
16//!   name does not close the outer.
17//!
18//! Each recognised region carries a [`MathSpan`] tag (inline, display,
19//! or environment) with the body byte range.
20//!
21//! Unmatched openers become [`MathError`] values without aborting the
22//! scan. Brace imbalance inside a recognised body is checked once per
23//! region and surfaces as [`MathError::UnbalancedBraces`]; the region
24//! still scans because its markers are balanced; canonicalisation
25//! skips body rewrites for that region.
26
27use std::ops::Range;
28
29use super::MathRegion;
30use super::env::{EnvKind, KnownEnv};
31use super::span::{AnyDelim, DisplayDelim, InlineDelim, MathBody, MathError, MathSpan};
32
/// Which math delimiter pairs to recognise.
///
/// Defaults recognise `\(…\)`, `\[…\]`, and LaTeX environments. The
/// dollar variants are opt-in because `$` collides with currency
/// symbols and shell prompts in non-math prose.
#[derive(Copy, Clone, Debug)]
// Five independent on/off switches, which is more bools than this lint allows by default.
#[allow(clippy::struct_excessive_bools)]
pub struct MathConfig {
    /// Recognise `\[ … \]` as display math. Defaults to `true`.
    pub backslash_bracket: bool,
    /// Recognise `\( … \)` as inline math. Defaults to `true`.
    pub backslash_paren: bool,
    /// Recognise `$$ … $$` as display math. Defaults to `false`.
    pub double_dollar: bool,
    /// Recognise `$ … $` as inline math. Defaults to `false`.
    pub single_dollar: bool,
    /// LaTeX `\begin{env}…\end{env}` recognition. Defaults to `true`:
    /// environments outside `\[ \]` are common in mathematical prose
    /// (e.g. raw `\begin{align}` blocks rendered by `KaTeX`) and have
    /// unambiguous closers, unlike `$`.
    pub environments: bool,
}
51
52impl Default for MathConfig {
53    fn default() -> Self {
54        Self {
55            backslash_bracket: true,
56            backslash_paren: true,
57            double_dollar: false,
58            single_dollar: false,
59            environments: true,
60        }
61    }
62}
63
64/// Scan `source` for math regions. Returns regions in source order
65/// (non-overlapping) and any unmatched openers / brace-imbalanced
66/// bodies as errors.
67///
68/// `transparent_runs` is a sorted, non-overlapping slice of byte
69/// ranges the lexer must treat as if they do not exist — blockquote
70/// `>` markers and list-item continuation indentation on continuation
71/// lines. Math regions may cross transparent runs (they are not
72/// region boundaries the way exclusion zones are); the runs are
73/// recorded on each body so [`MathBody::as_str`] can yield clean
74/// math content.
75#[tracing::instrument(
76    level = "debug",
77    skip_all,
78    fields(len = source.len(), transparent = transparent_runs.len()),
79)]
80pub fn scan_math_regions(
81    source: &str,
82    exclusions: &[Range<usize>],
83    transparent_runs: &[Range<usize>],
84    cfg: MathConfig,
85) -> (Vec<MathRegion>, Vec<MathError>) {
86    let bytes = source.as_bytes();
87    let mut regions: Vec<MathRegion> = Vec::new();
88    let mut errors: Vec<MathError> = Vec::new();
89    let mut i = 0usize;
90    while i < bytes.len() {
91        if let Some(end) = excluded_end(exclusions, i) {
92            i = end;
93            continue;
94        }
95        if let Some(end) = transparent_end(transparent_runs, i) {
96            i = end;
97            continue;
98        }
99        // Environments first: `\begin{name}` is structurally
100        // unambiguous and would otherwise be passed over.
101        if cfg.environments
102            && let Some((env_name, name_range, after_begin)) = match_begin(source, bytes, i)
103        {
104            match find_end_env(source, bytes, after_begin, env_name, exclusions, transparent_runs) {
105                Some((end_start, end_after)) => {
106                    let region = i..end_after;
107                    let body_range = after_begin..end_start;
108                    let env = match KnownEnv::from_name(env_name) {
109                        Some(k) => EnvKind::Known(k),
110                        None => EnvKind::Custom(name_range),
111                    };
112                    let body = build_math_body(body_range.clone(), transparent_runs);
113                    record_brace_errors(source, &region, &body, &mut errors);
114                    let span = MathSpan::Environment { env, body };
115                    regions.push(MathRegion::new(region.clone(), span));
116                    tracing::debug!(
117                        env = env_name,
118                        range = ?region,
119                        stripped = !body_runs_empty(&body_range, transparent_runs),
120                        "env region",
121                    );
122                    i = end_after;
123                    continue;
124                }
125                None => {
126                    errors.push(MathError::UnbalancedEnv {
127                        name: env_name.to_string(),
128                        range: i..after_begin,
129                    });
130                    i = after_begin;
131                    continue;
132                }
133            }
134        }
135        let Some((delim, open_len)) = match_open(bytes, i, cfg) else {
136            i = i.saturating_add(1);
137            continue;
138        };
139        let content_start = i.saturating_add(open_len);
140        match find_close(bytes, content_start, delim, exclusions, transparent_runs) {
141            Some(close_start) => {
142                // Reject backslash-delimited bodies with no alphanumeric
143                // content. A `\(...\)` or `\[...\]` whose body is only
144                // backslashes, brackets, or whitespace is almost certainly
145                // a sequence of CM backslash escapes, not math — GFM §6.1
146                // ex. 308's `\!\"...\(\)...\[\\\]...` is the canonical case.
147                // Without this guard the recogniser would treat `\(\)`
148                // (empty) and `\[\\\]` (body `\\`) as math, the formatter
149                // would normalise them, and the round-trip HTML would
150                // diverge from the source's escape-sequence rendering.
151                //
152                // This guard is scoped to `\[`/`\(` deliberately: `$` and
153                // `$$` cannot arise from CommonMark escape sequences, so a
154                // symbol-only body like `$+$`, `$<$`, or `$[-,-]$` is real
155                // math. Applying the guard to dollars used to skip the
156                // opener but leave the closing `$` to be re-scanned as a
157                // fresh opener, yielding a spurious `UnbalancedDelim`.
158                let body_slice = bytes.get(content_start..close_start).unwrap_or(&[]);
159                if matches!(delim, AnyDelim::Bracket | AnyDelim::Paren)
160                    && !body_slice.iter().any(u8::is_ascii_alphanumeric)
161                {
162                    i = i.saturating_add(1);
163                    continue;
164                }
165                let close_len = delim.close().len();
166                let region_end = close_start.saturating_add(close_len);
167                let region = i..region_end;
168                let body_range = content_start..close_start;
169                let body = build_math_body(body_range.clone(), transparent_runs);
170                record_brace_errors(source, &region, &body, &mut errors);
171                let span = match delim {
172                    AnyDelim::Paren => MathSpan::Inline {
173                        delim: InlineDelim::Paren,
174                        body,
175                    },
176                    AnyDelim::Dollar => MathSpan::Inline {
177                        delim: InlineDelim::Dollar,
178                        body,
179                    },
180                    AnyDelim::Bracket => MathSpan::Display {
181                        delim: DisplayDelim::Bracket,
182                        body,
183                    },
184                    AnyDelim::Dollar2 => MathSpan::Display {
185                        delim: DisplayDelim::Dollar2,
186                        body,
187                    },
188                };
189                regions.push(MathRegion::new(region.clone(), span));
190                tracing::debug!(
191                    delim = delim.open(),
192                    range = ?region,
193                    stripped = !body_runs_empty(&body_range, transparent_runs),
194                    "delim region",
195                );
196                i = region_end;
197            }
198            None => {
199                errors.push(MathError::UnbalancedDelim {
200                    delim,
201                    range: i..content_start,
202                });
203                i = content_start;
204            }
205        }
206    }
207    (regions, errors)
208}
209
210/// Collect the slice of transparent runs intersecting `body_range`
211/// and pack them into a [`MathBody`]. The recogniser keeps the
212/// invariant that `transparent_runs` is sorted and non-overlapping,
213/// so the intersection is contiguous.
214fn build_math_body(body_range: Range<usize>, transparent_runs: &[Range<usize>]) -> MathBody {
215    let runs: Box<[Range<usize>]> = transparent_runs
216        .iter()
217        .filter(|r| r.start < body_range.end && body_range.start < r.end)
218        .cloned()
219        .collect();
220    MathBody::new(body_range, runs)
221}
222
/// Returns `true` when every transparent run lies entirely outside
/// `body_range`. This is an allocation-free probe used for the
/// tracing debug field; the overlapping runs themselves are
/// collected separately by [`build_math_body`].
fn body_runs_empty(body_range: &Range<usize>, transparent_runs: &[Range<usize>]) -> bool {
    transparent_runs
        .iter()
        .all(|run| run.start >= body_range.end || body_range.start >= run.end)
}
231
232/// Push a `MathError::UnbalancedBraces` if `body`'s clean content has
233/// unbalanced `{` / `}`. Delegates to the shared validator in
234/// [`super::normalise::body_braces_balanced`] so the scanner, the
235/// canonicalise math rewrite, and the lint rule agree on what counts
236/// as balanced. The check runs over the clean body, so container
237/// prefixes cannot affect brace balance, and the local offset is
238/// mapped back to a source-absolute byte via
239/// [`MathBody::clean_offset_to_source`].
240fn record_brace_errors(source: &str, region: &Range<usize>, body: &MathBody, errors: &mut Vec<MathError>) {
241    let clean = body.as_str(source);
242    if let Err(local_offset) = super::normalise::body_braces_balanced(clean.as_ref()) {
243        errors.push(MathError::UnbalancedBraces {
244            offset: body.clean_offset_to_source(local_offset),
245            region: region.clone(),
246        });
247    }
248}
249
/// If `i` falls inside one of the `exclusions`, return that zone's
/// end. Uses a binary search, so it relies on `exclusions` being
/// sorted by start and non-overlapping.
fn excluded_end(exclusions: &[Range<usize>], i: usize) -> Option<usize> {
    // Number of zones starting at or before `i`; the last of them is
    // the only one that can contain `i`.
    let started = exclusions.partition_point(|zone| zone.start <= i);
    let candidate = exclusions.get(started.checked_sub(1)?)?;
    (i < candidate.end).then_some(candidate.end)
}
260
/// If `i` lies inside a transparent run, return the end of that run;
/// otherwise `None`. The lookup has the same shape as
/// [`excluded_end`], but the meaning differs: transparent runs do not
/// bound math regions. The scanner and [`find_close`] /
/// [`find_end_env`] use this to hop over prefix bytes while keeping
/// the surrounding region intact.
fn transparent_end(transparent_runs: &[Range<usize>], i: usize) -> Option<usize> {
    let started = transparent_runs.partition_point(|run| run.start <= i);
    // Only the last run starting at or before `i` can contain it.
    let containing = started
        .checked_sub(1)
        .and_then(|last| transparent_runs.get(last));
    match containing {
        Some(run) if i < run.end => Some(run.end),
        _ => None,
    }
}
276
277/// Match `\begin{name}` at `i`. Returns `(name, byte range of the
278/// name, position after the closing `}`)`. The `\` must not be itself
279/// escaped (even-count of preceding backslashes).
280fn match_begin<'a>(source: &'a str, bytes: &[u8], i: usize) -> Option<(&'a str, Range<usize>, usize)> {
281    let after = match_kw(bytes, i, b"begin")?;
282    parse_env_name(source, after)
283}
284
285/// Match `\end{name}` at `j`. Returns `(name, byte range of the name,
286/// position after the closing `}`)`.
287fn match_end<'a>(source: &'a str, bytes: &[u8], j: usize) -> Option<(&'a str, Range<usize>, usize)> {
288    let after = match_kw(bytes, j, b"end")?;
289    parse_env_name(source, after)
290}
291
292/// Common prefix check for `\begin` / `\end`. Returns the position
293/// just after the keyword on success.
294fn match_kw(bytes: &[u8], i: usize, keyword: &[u8]) -> Option<usize> {
295    if bytes.get(i).copied() != Some(b'\\') {
296        return None;
297    }
298    if !preceding_backslashes_even(bytes, i) {
299        return None;
300    }
301    let kw_start = i.saturating_add(1);
302    let kw_end = kw_start.saturating_add(keyword.len());
303    if bytes.get(kw_start..kw_end) != Some(keyword) {
304        return None;
305    }
306    Some(kw_end)
307}
308
/// Parse `{name}` starting at `after`, where `name` is `[A-Za-z]+\*?`
/// (LaTeX environment name convention). Returns `(name, byte range of
/// the name in `source`, position after the closing `}`)`.
///
/// Returns `None` when `after` is not a `{`, when the name has no
/// letters (so both `{}` and a bare `{*}` are rejected), or when the
/// name is not immediately followed by `}`.
fn parse_env_name(source: &str, after: usize) -> Option<(&str, Range<usize>, usize)> {
    let bytes = source.as_bytes();
    if bytes.get(after).copied() != Some(b'{') {
        return None;
    }
    let name_start = after.saturating_add(1);
    let letters = bytes
        .get(name_start..)?
        .iter()
        .take_while(|b| b.is_ascii_alphabetic())
        .count();
    // The emptiness check must come before the optional `*` is
    // consumed; otherwise a lone `*` would pass as a name.
    if letters == 0 {
        return None;
    }
    let mut name_end = name_start.saturating_add(letters);
    // Optional trailing `*` for the unnumbered variants.
    if bytes.get(name_end).copied() == Some(b'*') {
        name_end = name_end.saturating_add(1);
    }
    if bytes.get(name_end).copied() != Some(b'}') {
        return None;
    }
    let name = source.get(name_start..name_end)?;
    Some((name, name_start..name_end, name_end.saturating_add(1)))
}
339
340/// Find the matching `\end{name}` for an open environment. Returns
341/// the byte index of the `\` of `\end` and the byte index just after
342/// the closing `}` of `\end{name}`. Counts nested `\begin{name}` so
343/// inner environments of the same name do not close the outer.
344fn find_end_env(
345    source: &str,
346    bytes: &[u8],
347    from: usize,
348    name: &str,
349    exclusions: &[Range<usize>],
350    transparent_runs: &[Range<usize>],
351) -> Option<(usize, usize)> {
352    let mut depth: u32 = 1;
353    let mut j = from;
354    while j < bytes.len() {
355        if let Some(end) = excluded_end(exclusions, j) {
356            j = end;
357            continue;
358        }
359        if let Some(end) = transparent_end(transparent_runs, j) {
360            j = end;
361            continue;
362        }
363        if let Some((found_name, _, after)) = match_end(source, bytes, j) {
364            if found_name == name {
365                depth = depth.saturating_sub(1);
366                if depth == 0 {
367                    return Some((j, after));
368                }
369            }
370            j = after;
371            continue;
372        }
373        if let Some((found_name, _, after)) = match_begin(source, bytes, j) {
374            if found_name == name {
375                depth = depth.saturating_add(1);
376            }
377            j = after;
378            continue;
379        }
380        j = j.saturating_add(1);
381    }
382    None
383}
384
385/// Match a primitive delimiter opener at position `i`. Returns the
386/// matched delimiter and the byte length of the open token.
387fn match_open(bytes: &[u8], i: usize, cfg: MathConfig) -> Option<(AnyDelim, usize)> {
388    let b = bytes.get(i).copied()?;
389    match b {
390        b'\\' => {
391            if !preceding_backslashes_even(bytes, i) {
392                return None;
393            }
394            let next = bytes.get(i.saturating_add(1)).copied()?;
395            match next {
396                b'[' if cfg.backslash_bracket => Some((AnyDelim::Bracket, 2)),
397                b'(' if cfg.backslash_paren => Some((AnyDelim::Paren, 2)),
398                _ => None,
399            }
400        }
401        b'$' => {
402            // `\$` is a CommonMark-escaped literal dollar, not a delimiter.
403            if !preceding_backslashes_even(bytes, i) {
404                return None;
405            }
406            let two = bytes.get(i.saturating_add(1)).copied();
407            if cfg.double_dollar && two == Some(b'$') {
408                Some((AnyDelim::Dollar2, 2))
409            } else if cfg.single_dollar {
410                Some((AnyDelim::Dollar, 1))
411            } else {
412                None
413            }
414        }
415        _ => None,
416    }
417}
418
/// Measure the unbroken run of `\` bytes that ends right before
/// index `i` and report whether its length is even — in which case
/// `bytes[i]` starts a fresh, unescaped sequence. An `i` past the end
/// of `bytes` counts as a run of length zero.
fn preceding_backslashes_even(bytes: &[u8], i: usize) -> bool {
    let run_len: usize = bytes.get(..i).map_or(0, |before| {
        before.iter().rev().take_while(|&&b| b == b'\\').count()
    });
    run_len.is_multiple_of(2)
}
436
437/// Search for the matching close delimiter starting at `from`.
438fn find_close(
439    bytes: &[u8],
440    from: usize,
441    delim: AnyDelim,
442    exclusions: &[Range<usize>],
443    transparent_runs: &[Range<usize>],
444) -> Option<usize> {
445    let mut j = from;
446    while j < bytes.len() {
447        if excluded_end(exclusions, j).is_some() {
448            // Math regions don't cross an exclusion boundary.
449            return None;
450        }
451        if let Some(end) = transparent_end(transparent_runs, j) {
452            // Transparent bytes (container prefixes) are not part of
453            // the math content; skip past them and keep looking for
454            // the close.
455            j = end;
456            continue;
457        }
458        match delim {
459            AnyDelim::Bracket | AnyDelim::Paren => {
460                if bytes.get(j).copied() == Some(b'\\')
461                    && bytes.get(j.saturating_add(1)).copied() == Some(close_target_byte(delim))
462                    && preceding_backslashes_even(bytes, j)
463                {
464                    return Some(j);
465                }
466            }
467            AnyDelim::Dollar2 => {
468                if bytes.get(j).copied() == Some(b'$')
469                    && bytes.get(j.saturating_add(1)).copied() == Some(b'$')
470                    && preceding_backslashes_even(bytes, j)
471                {
472                    return Some(j);
473                }
474            }
475            AnyDelim::Dollar => {
476                if bytes.get(j).copied() == Some(b'$') && preceding_backslashes_even(bytes, j) {
477                    return Some(j);
478                }
479            }
480        }
481        j = j.saturating_add(1);
482    }
483    None
484}
485
486const fn close_target_byte(delim: AnyDelim) -> u8 {
487    match delim {
488        AnyDelim::Bracket => b']',
489        AnyDelim::Paren => b')',
490        AnyDelim::Dollar2 | AnyDelim::Dollar => b'$',
491    }
492}
493
494#[cfg(test)]
495#[allow(clippy::indexing_slicing, clippy::panic)]
496mod tests {
497    use std::borrow::Cow;
498
499    use super::*;
500
501    fn scan(source: &str) -> (Vec<MathRegion>, Vec<MathError>) {
502        scan_math_regions(source, &[], &[], MathConfig::default())
503    }
504
505    fn scan_with_runs(
506        source: &str,
507        transparent_runs: &[Range<usize>],
508        cfg: MathConfig,
509    ) -> (Vec<MathRegion>, Vec<MathError>) {
510        scan_math_regions(source, &[], transparent_runs, cfg)
511    }
512
513    fn regions(source: &str) -> Vec<MathRegion> {
514        scan(source).0
515    }
516
517    #[test]
518    fn display_math_single_line() {
519        let s = r"prefix \[ A \] suffix";
520        let regs = regions(s);
521        assert_eq!(regs.len(), 1);
522        assert_eq!(&s[regs[0].range.clone()], r"\[ A \]");
523        assert!(matches!(
524            regs[0].span(),
525            MathSpan::Display {
526                delim: DisplayDelim::Bracket,
527                ..
528            }
529        ));
530    }
531
532    #[test]
533    fn display_math_multi_line() {
534        let s = "before \\[\n  A \\to B\n\\] after";
535        let regs = regions(s);
536        assert_eq!(regs.len(), 1);
537        let span = &s[regs[0].range.clone()];
538        assert!(span.starts_with(r"\["));
539        assert!(span.ends_with(r"\]"));
540    }
541
542    #[test]
543    fn inline_math_paren() {
544        let s = r"x is \( a + b \) units";
545        let regs = regions(s);
546        assert_eq!(regs.len(), 1);
547        assert_eq!(&s[regs[0].range.clone()], r"\( a + b \)");
548        assert!(matches!(
549            regs[0].span(),
550            MathSpan::Inline {
551                delim: InlineDelim::Paren,
552                ..
553            }
554        ));
555    }
556
557    #[test]
558    fn two_separate_regions() {
559        let s = r"see \[ A \] and \[ B \] both";
560        let regs = regions(s);
561        assert_eq!(regs.len(), 2);
562        assert!(regs[0].range.end <= regs[1].range.start);
563    }
564
565    #[test]
566    fn unbalanced_open_drops_region_and_emits_error() {
567        let s = r"start \[ no close here";
568        let (regs, errs) = scan(s);
569        assert!(regs.is_empty());
570        assert_eq!(errs.len(), 1);
571        match &errs[0] {
572            MathError::UnbalancedDelim { delim, .. } => {
573                assert!(delim.is_display());
574                assert_eq!(delim.open(), r"\[");
575                assert_eq!(delim.close(), r"\]");
576            }
577            MathError::UnbalancedEnv { .. } | MathError::UnbalancedBraces { .. } => {
578                panic!("expected delim error")
579            }
580        }
581    }
582
583    #[test]
584    fn greedy_first_close() {
585        let s = r"\[ a \[ b \] c \]";
586        let regs = regions(s);
587        assert_eq!(regs.len(), 1);
588        assert_eq!(&s[regs[0].range.clone()], r"\[ a \[ b \]");
589    }
590
591    #[test]
592    fn double_backslash_open_is_not_math() {
593        let s = r"foo \\[ not math \] bar";
594        assert!(regions(s).is_empty());
595    }
596
597    #[test]
598    fn triple_backslash_open_is_math() {
599        let s = r"foo \\\[ A \] bar";
600        assert_eq!(regions(s).len(), 1);
601    }
602
603    #[test]
604    #[allow(
605        clippy::single_range_in_vec_init,
606        reason = "test intentionally passes one exclusion range"
607    )]
608    fn region_inside_code_span_excluded() {
609        let s = r"text `\[ x \]` more";
610        let exclusions = [5..14];
611        let (regs, _) = scan_math_regions(s, &exclusions, &[], MathConfig::default());
612        assert!(regs.is_empty());
613    }
614
615    #[test]
616    #[allow(
617        clippy::single_range_in_vec_init,
618        reason = "test intentionally passes one exclusion range"
619    )]
620    fn region_inside_code_block_excluded() {
621        let s = "```\n\\[ x \\]\n```";
622        let exclusions = [0..s.len()];
623        let (regs, _) = scan_math_regions(s, &exclusions, &[], MathConfig::default());
624        assert!(regs.is_empty());
625    }
626
627    #[test]
628    #[allow(
629        clippy::single_range_in_vec_init,
630        reason = "test intentionally passes one exclusion range"
631    )]
632    fn region_inside_inline_html_excluded() {
633        let s = r#"see <a href="/x?val=$foo">x</a> after"#;
634        let exclusions = [4..26];
635        let cfg = MathConfig {
636            single_dollar: true,
637            ..MathConfig::default()
638        };
639        let (regs, _) = scan_math_regions(s, &exclusions, &[], cfg);
640        assert!(regs.is_empty());
641    }
642
643    #[test]
644    fn dollar_variants_off_by_default() {
645        let s = "value is $5 today, plus $$2 tomorrow";
646        assert!(regions(s).is_empty());
647    }
648
649    #[test]
650    fn double_dollar_when_enabled() {
651        let s = "see $$ x = 5 $$ above";
652        let cfg = MathConfig {
653            double_dollar: true,
654            ..MathConfig::default()
655        };
656        let (regs, _) = scan_math_regions(s, &[], &[], cfg);
657        assert_eq!(regs.len(), 1);
658        assert_eq!(&s[regs[0].range.clone()], "$$ x = 5 $$");
659        assert!(matches!(
660            regs[0].span(),
661            MathSpan::Display {
662                delim: DisplayDelim::Dollar2,
663                ..
664            }
665        ));
666    }
667
668    #[test]
669    fn single_dollar_when_enabled() {
670        let s = "x is $a + b$";
671        let cfg = MathConfig {
672            single_dollar: true,
673            ..MathConfig::default()
674        };
675        let (regs, _) = scan_math_regions(s, &[], &[], cfg);
676        assert_eq!(regs.len(), 1);
677        assert_eq!(&s[regs[0].range.clone()], "$a + b$");
678        assert!(matches!(
679            regs[0].span(),
680            MathSpan::Inline {
681                delim: InlineDelim::Dollar,
682                ..
683            }
684        ));
685    }
686
687    #[test]
688    fn region_with_subscripts_and_emphasis_chars() {
689        let s = r"see \[ \pi_A:\Gamma.A\to \Gamma \] above";
690        let regs = regions(s);
691        assert_eq!(regs.len(), 1);
692        let span = &s[regs[0].range.clone()];
693        assert!(span.contains("_A"));
694        assert!(span.contains(r"\Gamma"));
695    }
696
697    #[test]
698    fn regions_dont_overlap_or_misorder() {
699        let s = r"\[ a \] mid \( b \) end \[ c \]";
700        let regs = regions(s);
701        assert_eq!(regs.len(), 3);
702        for w in regs.windows(2) {
703            assert!(w[0].range.end <= w[1].range.start);
704        }
705    }
706
707    #[test]
708    fn environment_basic() {
709        let s = "before \\begin{align} x &= y \\end{align} after";
710        let regs = regions(s);
711        assert_eq!(regs.len(), 1);
712        let span = &s[regs[0].range.clone()];
713        assert!(span.starts_with("\\begin{align}"));
714        assert!(span.ends_with("\\end{align}"));
715        match regs[0].span() {
716            MathSpan::Environment { env, body } => {
717                assert!(matches!(env, EnvKind::Known(KnownEnv::Align)));
718                assert!(body.as_str(s).contains("x &= y"));
719            }
720            MathSpan::Inline { .. } | MathSpan::Display { .. } => {
721                panic!("expected environment span")
722            }
723        }
724    }
725
726    #[test]
727    fn environment_nested_same_name() {
728        let s = "\\begin{matrix} a \\begin{matrix} b \\end{matrix} c \\end{matrix}";
729        let regs = regions(s);
730        assert_eq!(regs.len(), 1);
731        assert_eq!(&s[regs[0].range.clone()], s);
732    }
733
734    #[test]
735    fn environment_starred_name() {
736        let s = "\\begin{align*} x \\end{align*}";
737        let regs = regions(s);
738        assert_eq!(regs.len(), 1);
739        assert!(matches!(
740            regs[0].span(),
741            MathSpan::Environment {
742                env: EnvKind::Known(KnownEnv::AlignStar),
743                ..
744            }
745        ));
746    }
747
748    #[test]
749    fn environment_custom_name_round_trips() {
750        let s = "\\begin{widget} q \\end{widget}";
751        let regs = regions(s);
752        assert_eq!(regs.len(), 1);
753        match regs[0].span() {
754            MathSpan::Environment {
755                env: EnvKind::Custom(name_range),
756                ..
757            } => {
758                assert_eq!(&s[name_range.clone()], "widget");
759            }
760            MathSpan::Inline { .. }
761            | MathSpan::Display { .. }
762            | MathSpan::Environment {
763                env: EnvKind::Known(_), ..
764            } => {
765                panic!("expected custom env")
766            }
767        }
768    }
769
770    #[test]
771    fn environment_unbalanced_emits_error() {
772        let s = "\\begin{align} x = 1 \n";
773        let (regs, errs) = scan(s);
774        assert!(regs.is_empty());
775        assert_eq!(errs.len(), 1);
776        assert!(matches!(&errs[0], MathError::UnbalancedEnv { name, .. } if name == "align"));
777    }
778
779    #[test]
780    fn environment_inside_display_is_one_region() {
781        let s = "\\[ \\begin{aligned} a &= b \\end{aligned} \\]";
782        let regs = regions(s);
783        assert_eq!(regs.len(), 1);
784        // The outer region is Display (brackets); the inner aligned
785        // is part of the body, not its own top-level region.
786        assert!(matches!(
787            regs[0].span(),
788            MathSpan::Display {
789                delim: DisplayDelim::Bracket,
790                ..
791            }
792        ));
793    }
794
795    #[test]
796    fn brace_imbalance_emits_error_but_region_still_scans() {
797        let s = r"\[ \frac{a}{b \]";
798        let (regs, errs) = scan(s);
799        assert_eq!(regs.len(), 1);
800        assert!(errs.iter().any(|e| matches!(e, MathError::UnbalancedBraces { .. })));
801    }
802
803    #[test]
804    fn brace_balance_with_escaped_braces() {
805        let s = r"\[ \{ a \} \]";
806        let (_, errs) = scan(s);
807        assert!(
808            errs.iter().all(|e| !matches!(e, MathError::UnbalancedBraces { .. })),
809            "escaped braces should not count: {errs:?}"
810        );
811    }
812
813    #[test]
814    fn transparent_run_in_blockquote_strips_prefix() {
815        // `> $$\n> x = 1\n> $$` — the `> ` on lines 2 and 3 sits
816        // inside the math body and must be stripped from the clean
817        // content.
818        let s = "> $$\n> x = 1\n> $$";
819        // The `> ` prefix sits on each line. The math open is at
820        // byte 2 (`$$`). The body runs from byte 4 (after `$$`) to
821        // byte 15 (before the close `$$`). Transparent runs cover
822        // the `> ` prefixes on lines 2 and 3.
823        let runs = vec![5..7, 13..15];
824        let cfg = MathConfig {
825            double_dollar: true,
826            ..MathConfig::default()
827        };
828        let (regs, _) = scan_with_runs(s, &runs, cfg);
829        assert_eq!(regs.len(), 1, "expected one region in {s:?}");
830        let body = regs[0].span().body();
831        let clean = body.as_str(s);
832        assert!(
833            matches!(&clean, Cow::Owned(_)),
834            "expected owned body for container-nested math, got {clean:?}",
835        );
836        assert!(!clean.contains('>'), "container prefix leaked: {clean:?}");
837        assert!(clean.contains("x = 1"), "body lost content: {clean:?}");
838    }
839
840    #[test]
841    fn transparent_run_in_list_item_strips_indent() {
842        // `1. item\n   $$\n   x = 1\n   $$` — the 3-space
843        // continuation indent on lines 2-4 must be stripped from the
844        // clean content.
845        let s = "1. item\n   $$\n   x = 1\n   $$";
846        // Continuation indents at line 2 (bytes 8..11), line 3
847        // (14..17), line 4 (23..26).
848        let runs = vec![8..11, 14..17, 23..26];
849        let cfg = MathConfig {
850            double_dollar: true,
851            ..MathConfig::default()
852        };
853        let (regs, _) = scan_with_runs(s, &runs, cfg);
854        assert_eq!(regs.len(), 1);
855        let clean = regs[0].span().body().as_str(s);
856        assert!(matches!(&clean, Cow::Owned(_)));
857        assert!(!clean.contains("   "), "indent leaked: {clean:?}");
858        assert!(clean.contains("x = 1"));
859    }
860
861    #[test]
862    fn nested_blockquote_combined_prefix() {
863        // `> > $$\n> > x\n> > $$` — both nesting levels' `> `
864        // prefixes are combined into one transparent run per line.
865        let s = "> > $$\n> > x\n> > $$";
866        // Line 2 (`> > x`): bytes 7..11 → "> > " (4 bytes).
867        // Line 3 (`> > $$`): bytes 13..17 → "> > " (4 bytes).
868        let runs = vec![7..11, 13..17];
869        let cfg = MathConfig {
870            double_dollar: true,
871            ..MathConfig::default()
872        };
873        let (regs, _) = scan_with_runs(s, &runs, cfg);
874        assert_eq!(regs.len(), 1);
875        let clean = regs[0].span().body().as_str(s);
876        assert!(!clean.contains('>'), "prefix leaked: {clean:?}");
877        assert!(clean.contains('x'));
878    }
879
880    #[test]
881    fn top_level_math_borrows() {
882        // `$$\nx\n$$` at root with empty transparent runs: the body
883        // is borrowed from source, no allocation. Test the
884        // `Cow::Borrowed` discriminant explicitly so the fast path
885        // can't regress silently.
886        let s = "$$\nx\n$$";
887        let cfg = MathConfig {
888            double_dollar: true,
889            ..MathConfig::default()
890        };
891        let (regs, _) = scan_with_runs(s, &[], cfg);
892        assert_eq!(regs.len(), 1);
893        let clean = regs[0].span().body().as_str(s);
894        assert!(
895            matches!(clean, Cow::Borrowed(_)),
896            "expected borrowed body for top-level math",
897        );
898    }
899
900    #[test]
901    fn body_source_ranges_can_drive_latex_translation_without_markdown_parsing() {
902        let s = r"Inline \( \alpha_i \) and \[ x^{2} \].";
903        let regs = regions(s);
904        let ranges = regs
905            .iter()
906            .map(|region| region.span().body().source_range())
907            .collect::<Vec<_>>();
908
909        let translated = mdwright_latex::translate_latex_ranges_to_unicode(s, &ranges);
910
911        assert_eq!(translated.text(), r"Inline \( αᵢ \) and \[ x² \].");
912        assert_eq!(translated.edit_count(), 2);
913        assert!(translated.is_lossless());
914    }
915
916    #[test]
917    fn transparent_run_protects_delim_match() {
918        // `> $$ x\n> $$` — the close `$$` on line 2 is outside any
919        // transparent run; the region must still be recognised
920        // end-to-end with the body crossing the `\n> ` prefix.
921        let s = "> $$ x\n> $$";
922        let run = 7..9;
923        let runs = std::slice::from_ref(&run);
924        let cfg = MathConfig {
925            double_dollar: true,
926            ..MathConfig::default()
927        };
928        let (regs, _) = scan_with_runs(s, runs, cfg);
929        assert_eq!(regs.len(), 1, "expected one region in {s:?}");
930        // Close $$ is at bytes 9..11.
931        assert_eq!(regs[0].range.end, 11);
932    }
933
934    #[test]
935    fn transparent_run_blocks_spurious_delim() {
936        // `not math\n> $\n` with single-dollar enabled and the `> `
937        // recorded as a transparent run on line 2. The `$` after
938        // the prefix is at the end of the line and never sees a
939        // close — so the scanner records an UnbalancedDelim, not
940        // a recognised region. The point of the test: the lexer
941        // does NOT see a `$` inside the transparent prefix bytes
942        // themselves (no spurious region anchored at `>`).
943        let s = "not math\n> $\n";
944        let run = 9..11;
945        let runs = std::slice::from_ref(&run);
946        let cfg = MathConfig {
947            single_dollar: true,
948            ..MathConfig::default()
949        };
950        let (regs, errs) = scan_with_runs(s, runs, cfg);
951        assert!(regs.is_empty(), "no region should match in {s:?}");
952        assert!(
953            errs.iter().any(|e| matches!(e, MathError::UnbalancedDelim { .. })),
954            "expected an UnbalancedDelim for the unclosed `$`: {errs:?}",
955        );
956    }
957
958    fn dollar_cfg() -> MathConfig {
959        MathConfig {
960            single_dollar: true,
961            double_dollar: true,
962            ..MathConfig::default()
963        }
964    }
965
966    #[test]
967    fn symbol_only_dollar_body_is_recognised() {
968        // `$+$`, `$<$`, `$[-,-]$` are balanced inline math whose body has
969        // no alphanumeric. They must be recognised (one region, no error):
970        // the old guard skipped the opener and re-scanned the closing `$`
971        // as a fresh opener, producing a spurious UnbalancedDelim.
972        for s in ["a $+$ b", "a $<$ b", "a $[-,-]$ b"] {
973            let (regs, errs) = scan_with_runs(s, &[], dollar_cfg());
974            assert_eq!(regs.len(), 1, "expected one region in {s:?}");
975            assert!(errs.is_empty(), "unexpected errors in {s:?}: {errs:?}");
976            assert!(matches!(
977                regs[0].span(),
978                MathSpan::Inline {
979                    delim: InlineDelim::Dollar,
980                    ..
981                }
982            ));
983        }
984    }
985
986    #[test]
987    fn escaped_dollar_is_not_a_delimiter() {
988        // `\$` is a CommonMark literal dollar; a lone one is not math.
989        let s = r"a lone \$ sign";
990        let (regs, errs) = scan_with_runs(s, &[], dollar_cfg());
991        assert!(regs.is_empty(), "no region in {s:?}: {regs:?}");
992        assert!(errs.is_empty(), "no errors in {s:?}: {errs:?}");
993    }
994
995    #[test]
996    fn escaped_dollar_inside_body_does_not_close() {
997        // The mid-body `\$` is literal, so the region runs to the final
998        // unescaped `$`.
999        let s = r"x $a \$ b$ y";
1000        let (regs, errs) = scan_with_runs(s, &[], dollar_cfg());
1001        assert_eq!(regs.len(), 1, "expected one region in {s:?}");
1002        assert!(errs.is_empty(), "no errors in {s:?}: {errs:?}");
1003        assert_eq!(&s[regs[0].range.clone()], r"$a \$ b$");
1004    }
1005
1006    #[test]
1007    fn backslash_escape_noise_still_rejected() {
1008        // GFM §6.1 ex. 308: `\[\\\]` is escape noise, not math. The bracket
1009        // guard must still reject it without a spurious delim error.
1010        let s = r"x \[\\\] y";
1011        let (regs, errs) = scan(s);
1012        assert!(regs.is_empty(), "no region in {s:?}: {regs:?}");
1013        assert!(
1014            !errs.iter().any(|e| matches!(e, MathError::UnbalancedDelim { .. })),
1015            "no spurious delim error in {s:?}: {errs:?}",
1016        );
1017    }
1018}