Skip to main content

mdwright_math/
scan.rs

1//! Structural math recogniser.
2//!
3//! Walks `source` left-to-right with exclusion zones derived from the
4//! IR's inline / block atoms (code spans, code blocks, HTML blocks,
5//! inline HTML). Inside an exclusion the scanner skips ahead to the
6//! zone's end, so `$` inside `` `cost is $5` `` or `<a title="$x$">`
7//! cannot anchor a math region.
8//!
//! Two opener families are recognised:
10//!
11//! - Delimited pairs: `\[ … \]`, `\( … \)`, `$$ … $$`, `$ … $`.
12//!   Greedy first-close matches the heuristic scanner's behaviour and
13//!   the way `KaTeX` / pandoc resolve these in practice.
14//! - LaTeX environments: `\begin{name} … \end{name}`. The recogniser
15//!   counts nested `\begin{name}` so an inner environment of the same
16//!   name does not close the outer.
17//!
18//! Each recognised region carries a [`MathSpan`] tag (inline, display,
19//! or environment) with the body byte range.
20//!
21//! Unmatched openers become [`MathError`] values without aborting the
22//! scan. Brace imbalance inside a recognised body is checked once per
23//! region and surfaces as [`MathError::UnbalancedBraces`]; the region
24//! still scans because its markers are balanced; canonicalisation
25//! skips body rewrites for that region.
26
27use std::ops::Range;
28
29use super::MathRegion;
30use super::env::{EnvKind, KnownEnv};
31use super::span::{AnyDelim, DisplayDelim, InlineDelim, MathBody, MathError, MathSpan};
32
/// Which math delimiter pairs to recognise.
///
/// The [`Default`] configuration recognises `\(…\)`, `\[…\]`, and
/// LaTeX environments. The dollar variants are opt-in because `$`
/// collides with currency symbols and shell prompts in non-math
/// prose.
#[derive(Copy, Clone, Debug)]
#[allow(
    clippy::struct_excessive_bools,
    reason = "each flag independently toggles one opener family; they do not encode a state machine"
)]
pub struct MathConfig {
    /// Recognise `\[ … \]` display math. Defaults to `true`.
    pub backslash_bracket: bool,
    /// Recognise `\( … \)` inline math. Defaults to `true`.
    pub backslash_paren: bool,
    /// Recognise `$$ … $$` display math. Defaults to `false`.
    pub double_dollar: bool,
    /// Recognise `$ … $` inline math. Defaults to `false`.
    pub single_dollar: bool,
    /// LaTeX `\begin{env}…\end{env}` recognition. Defaults to `true`:
    /// environments outside `\[ \]` are common in mathematical prose
    /// (e.g. raw `\begin{align}` blocks rendered by `KaTeX`) and have
    /// unambiguous closers, unlike `$`.
    pub environments: bool,
}
51
52impl Default for MathConfig {
53    fn default() -> Self {
54        Self {
55            backslash_bracket: true,
56            backslash_paren: true,
57            double_dollar: false,
58            single_dollar: false,
59            environments: true,
60        }
61    }
62}
63
64/// Scan `source` for math regions. Returns regions in source order
65/// (non-overlapping) and any unmatched openers / brace-imbalanced
66/// bodies as errors.
67///
68/// `transparent_runs` is a sorted, non-overlapping slice of byte
69/// ranges the lexer must treat as if they do not exist — blockquote
70/// `>` markers and list-item continuation indentation on continuation
71/// lines. Math regions may cross transparent runs (they are not
72/// region boundaries the way exclusion zones are); the runs are
73/// recorded on each body so [`MathBody::as_str`] can yield clean
74/// math content.
75#[tracing::instrument(
76    level = "debug",
77    skip_all,
78    fields(len = source.len(), transparent = transparent_runs.len()),
79)]
80pub fn scan_math_regions(
81    source: &str,
82    exclusions: &[Range<usize>],
83    transparent_runs: &[Range<usize>],
84    cfg: MathConfig,
85) -> (Vec<MathRegion>, Vec<MathError>) {
86    let bytes = source.as_bytes();
87    let mut regions: Vec<MathRegion> = Vec::new();
88    let mut errors: Vec<MathError> = Vec::new();
89    let mut i = 0usize;
90    while i < bytes.len() {
91        if let Some(end) = excluded_end(exclusions, i) {
92            i = end;
93            continue;
94        }
95        if let Some(end) = transparent_end(transparent_runs, i) {
96            i = end;
97            continue;
98        }
99        // Environments first: `\begin{name}` is structurally
100        // unambiguous and would otherwise be passed over.
101        if cfg.environments
102            && let Some((env_name, name_range, after_begin)) = match_begin(source, bytes, i)
103        {
104            match find_end_env(source, bytes, after_begin, env_name, exclusions, transparent_runs) {
105                Some((end_start, end_after)) => {
106                    let region = i..end_after;
107                    let body_range = after_begin..end_start;
108                    let env = match KnownEnv::from_name(env_name) {
109                        Some(k) => EnvKind::Known(k),
110                        None => EnvKind::Custom(name_range),
111                    };
112                    let body = build_math_body(body_range.clone(), transparent_runs);
113                    record_brace_errors(source, &region, &body, &mut errors);
114                    let span = MathSpan::Environment { env, body };
115                    regions.push(MathRegion::new(region.clone(), span));
116                    tracing::debug!(
117                        env = env_name,
118                        range = ?region,
119                        stripped = !body_runs_empty(&body_range, transparent_runs),
120                        "env region",
121                    );
122                    i = end_after;
123                    continue;
124                }
125                None => {
126                    errors.push(MathError::UnbalancedEnv {
127                        name: env_name.to_string(),
128                        range: i..after_begin,
129                    });
130                    i = after_begin;
131                    continue;
132                }
133            }
134        }
135        let Some((delim, open_len)) = match_open(bytes, i, cfg) else {
136            i = i.saturating_add(1);
137            continue;
138        };
139        let content_start = i.saturating_add(open_len);
140        match find_close(bytes, content_start, delim, exclusions, transparent_runs) {
141            Some(close_start) => {
142                // Reject bodies with no alphanumeric content. Real
143                // math always carries a variable, constant, or
144                // command name. A `\(...\)` or `\[...\]` whose body
145                // is only backslashes, brackets, or whitespace is
146                // almost certainly a sequence of CM backslash escapes
147                // — GFM §6.1 ex. 308's `\!\"...\(\)...\[\\\]...` is
148                // the canonical case. Without this guard, the
149                // recogniser would treat `\(\)` (empty) and `\[\\\]`
150                // (body `\\`) as math, the formatter would normalise
151                // them, and the round-trip HTML would diverge from
152                // the source's escape-sequence rendering.
153                let body_slice = bytes.get(content_start..close_start).unwrap_or(&[]);
154                if !body_slice.iter().any(u8::is_ascii_alphanumeric) {
155                    i = i.saturating_add(1);
156                    continue;
157                }
158                let close_len = delim.close().len();
159                let region_end = close_start.saturating_add(close_len);
160                let region = i..region_end;
161                let body_range = content_start..close_start;
162                let body = build_math_body(body_range.clone(), transparent_runs);
163                record_brace_errors(source, &region, &body, &mut errors);
164                let span = match delim {
165                    AnyDelim::Paren => MathSpan::Inline {
166                        delim: InlineDelim::Paren,
167                        body,
168                    },
169                    AnyDelim::Dollar => MathSpan::Inline {
170                        delim: InlineDelim::Dollar,
171                        body,
172                    },
173                    AnyDelim::Bracket => MathSpan::Display {
174                        delim: DisplayDelim::Bracket,
175                        body,
176                    },
177                    AnyDelim::Dollar2 => MathSpan::Display {
178                        delim: DisplayDelim::Dollar2,
179                        body,
180                    },
181                };
182                regions.push(MathRegion::new(region.clone(), span));
183                tracing::debug!(
184                    delim = delim.open(),
185                    range = ?region,
186                    stripped = !body_runs_empty(&body_range, transparent_runs),
187                    "delim region",
188                );
189                i = region_end;
190            }
191            None => {
192                errors.push(MathError::UnbalancedDelim {
193                    delim,
194                    range: i..content_start,
195                });
196                i = content_start;
197            }
198        }
199    }
200    (regions, errors)
201}
202
203/// Collect the slice of transparent runs intersecting `body_range`
204/// and pack them into a [`MathBody`]. The recogniser keeps the
205/// invariant that `transparent_runs` is sorted and non-overlapping,
206/// so the intersection is contiguous.
207fn build_math_body(body_range: Range<usize>, transparent_runs: &[Range<usize>]) -> MathBody {
208    let runs: Box<[Range<usize>]> = transparent_runs
209        .iter()
210        .filter(|r| r.start < body_range.end && body_range.start < r.end)
211        .cloned()
212        .collect();
213    MathBody::new(body_range, runs)
214}
215
/// True iff every transparent run lies wholly outside `body_range`.
/// Cheap probe for the tracing debug field — the actual
/// `Box<[Range]>` allocation happens once inside [`build_math_body`].
fn body_runs_empty(body_range: &Range<usize>, transparent_runs: &[Range<usize>]) -> bool {
    transparent_runs.iter().all(|run| {
        let starts_at_or_after_body_end = run.start >= body_range.end;
        let ends_at_or_before_body_start = body_range.start >= run.end;
        starts_at_or_after_body_end || ends_at_or_before_body_start
    })
}
224
225/// Push a `MathError::UnbalancedBraces` if `body`'s clean content has
226/// unbalanced `{` / `}`. Delegates to the shared validator in
227/// [`super::normalise::body_braces_balanced`] so the scanner, the
228/// canonicalise math rewrite, and the lint rule agree on what counts
229/// as balanced. The check runs over the clean body, so container
230/// prefixes cannot affect brace balance, and the local offset is
231/// mapped back to a source-absolute byte via
232/// [`MathBody::clean_offset_to_source`].
233fn record_brace_errors(source: &str, region: &Range<usize>, body: &MathBody, errors: &mut Vec<MathError>) {
234    let clean = body.as_str(source);
235    if let Err(local_offset) = super::normalise::body_braces_balanced(clean.as_ref()) {
236        errors.push(MathError::UnbalancedBraces {
237            offset: body.clean_offset_to_source(local_offset),
238            region: region.clone(),
239        });
240    }
241}
242
/// If `i` lies inside an exclusion zone, return that zone's end.
///
/// Uses a binary search on the zone starts, so `exclusions` is
/// expected to be sorted by start; only the last zone starting at or
/// before `i` is examined.
fn excluded_end(exclusions: &[Range<usize>], i: usize) -> Option<usize> {
    let starting_at_or_before = exclusions.partition_point(|zone| zone.start <= i);
    let candidate = starting_at_or_before
        .checked_sub(1)
        .and_then(|last| exclusions.get(last))?;
    (i < candidate.end).then_some(candidate.end)
}
253
/// If `i` lies inside a transparent run, return that run's end.
///
/// Mirrors [`excluded_end`] structurally but has different
/// semantics: transparent runs do not bound math regions; the scanner
/// and [`find_close`] / [`find_end_env`] use this to skip prefix
/// bytes while keeping the surrounding region intact.
fn transparent_end(transparent_runs: &[Range<usize>], i: usize) -> Option<usize> {
    // Runs are sorted, so only the last run starting at or before
    // `i` can contain it.
    let upper = transparent_runs.partition_point(|run| run.start <= i);
    let run = transparent_runs.get(upper.checked_sub(1)?)?;
    if i < run.end { Some(run.end) } else { None }
}
269
270/// Match `\begin{name}` at `i`. Returns `(name, byte range of the
271/// name, position after the closing `}`)`. The `\` must not be itself
272/// escaped (even-count of preceding backslashes).
273fn match_begin<'a>(source: &'a str, bytes: &[u8], i: usize) -> Option<(&'a str, Range<usize>, usize)> {
274    let after = match_kw(bytes, i, b"begin")?;
275    parse_env_name(source, after)
276}
277
278/// Match `\end{name}` at `j`. Returns `(name, byte range of the name,
279/// position after the closing `}`)`.
280fn match_end<'a>(source: &'a str, bytes: &[u8], j: usize) -> Option<(&'a str, Range<usize>, usize)> {
281    let after = match_kw(bytes, j, b"end")?;
282    parse_env_name(source, after)
283}
284
285/// Common prefix check for `\begin` / `\end`. Returns the position
286/// just after the keyword on success.
287fn match_kw(bytes: &[u8], i: usize, keyword: &[u8]) -> Option<usize> {
288    if bytes.get(i).copied() != Some(b'\\') {
289        return None;
290    }
291    if !preceding_backslashes_even(bytes, i) {
292        return None;
293    }
294    let kw_start = i.saturating_add(1);
295    let kw_end = kw_start.saturating_add(keyword.len());
296    if bytes.get(kw_start..kw_end) != Some(keyword) {
297        return None;
298    }
299    Some(kw_end)
300}
301
/// Parse `{name}` starting at `after`, where `name` is `[A-Za-z]+\*?`
/// (LaTeX environment name convention). Returns `(name, byte range of
/// the name in `source`, position after the closing `}`)`.
///
/// Returns `None` when `after` is not a `{`, when no ASCII letter
/// follows the `{` (so neither `{}` nor a bare `{*}` is a name), or
/// when the name is not immediately followed by `}`.
fn parse_env_name(source: &str, after: usize) -> Option<(&str, Range<usize>, usize)> {
    let bytes = source.as_bytes();
    if bytes.get(after).copied() != Some(b'{') {
        return None;
    }
    let name_start = after.saturating_add(1);
    let letters = bytes
        .get(name_start..)?
        .iter()
        .take_while(|b| b.is_ascii_alphabetic())
        .count();
    // At least one letter is mandatory. This is checked before the
    // optional `*` so that a lone star cannot pass as a name.
    if letters == 0 {
        return None;
    }
    let mut name_end = name_start.saturating_add(letters);
    // Optional trailing `*` for the unnumbered variants.
    if bytes.get(name_end).copied() == Some(b'*') {
        name_end = name_end.saturating_add(1);
    }
    if bytes.get(name_end).copied() != Some(b'}') {
        return None;
    }
    let name = source.get(name_start..name_end)?;
    Some((name, name_start..name_end, name_end.saturating_add(1)))
}
332
333/// Find the matching `\end{name}` for an open environment. Returns
334/// the byte index of the `\` of `\end` and the byte index just after
335/// the closing `}` of `\end{name}`. Counts nested `\begin{name}` so
336/// inner environments of the same name do not close the outer.
337fn find_end_env(
338    source: &str,
339    bytes: &[u8],
340    from: usize,
341    name: &str,
342    exclusions: &[Range<usize>],
343    transparent_runs: &[Range<usize>],
344) -> Option<(usize, usize)> {
345    let mut depth: u32 = 1;
346    let mut j = from;
347    while j < bytes.len() {
348        if let Some(end) = excluded_end(exclusions, j) {
349            j = end;
350            continue;
351        }
352        if let Some(end) = transparent_end(transparent_runs, j) {
353            j = end;
354            continue;
355        }
356        if let Some((found_name, _, after)) = match_end(source, bytes, j) {
357            if found_name == name {
358                depth = depth.saturating_sub(1);
359                if depth == 0 {
360                    return Some((j, after));
361                }
362            }
363            j = after;
364            continue;
365        }
366        if let Some((found_name, _, after)) = match_begin(source, bytes, j) {
367            if found_name == name {
368                depth = depth.saturating_add(1);
369            }
370            j = after;
371            continue;
372        }
373        j = j.saturating_add(1);
374    }
375    None
376}
377
378/// Match a primitive delimiter opener at position `i`. Returns the
379/// matched delimiter and the byte length of the open token.
380fn match_open(bytes: &[u8], i: usize, cfg: MathConfig) -> Option<(AnyDelim, usize)> {
381    let b = bytes.get(i).copied()?;
382    match b {
383        b'\\' => {
384            if !preceding_backslashes_even(bytes, i) {
385                return None;
386            }
387            let next = bytes.get(i.saturating_add(1)).copied()?;
388            match next {
389                b'[' if cfg.backslash_bracket => Some((AnyDelim::Bracket, 2)),
390                b'(' if cfg.backslash_paren => Some((AnyDelim::Paren, 2)),
391                _ => None,
392            }
393        }
394        b'$' => {
395            let two = bytes.get(i.saturating_add(1)).copied();
396            if cfg.double_dollar && two == Some(b'$') {
397                Some((AnyDelim::Dollar2, 2))
398            } else if cfg.single_dollar {
399                Some((AnyDelim::Dollar, 1))
400            } else {
401                None
402            }
403        }
404        _ => None,
405    }
406}
407
/// Count the run of `\` bytes ending immediately before `i` and
/// return true iff the count is even (so `bytes[i]` itself starts a
/// fresh, unescaped sequence). An `i` beyond the end of `bytes` has
/// no countable prefix and yields `true`.
fn preceding_backslashes_even(bytes: &[u8], i: usize) -> bool {
    bytes
        .get(..i)
        .unwrap_or(&[])
        .iter()
        .rev()
        .take_while(|&&b| b == b'\\')
        .count()
        .is_multiple_of(2)
}
425
426/// Search for the matching close delimiter starting at `from`.
427fn find_close(
428    bytes: &[u8],
429    from: usize,
430    delim: AnyDelim,
431    exclusions: &[Range<usize>],
432    transparent_runs: &[Range<usize>],
433) -> Option<usize> {
434    let mut j = from;
435    while j < bytes.len() {
436        if excluded_end(exclusions, j).is_some() {
437            // Math regions don't cross an exclusion boundary.
438            return None;
439        }
440        if let Some(end) = transparent_end(transparent_runs, j) {
441            // Transparent bytes (container prefixes) are not part of
442            // the math content; skip past them and keep looking for
443            // the close.
444            j = end;
445            continue;
446        }
447        match delim {
448            AnyDelim::Bracket | AnyDelim::Paren => {
449                if bytes.get(j).copied() == Some(b'\\')
450                    && bytes.get(j.saturating_add(1)).copied() == Some(close_target_byte(delim))
451                    && preceding_backslashes_even(bytes, j)
452                {
453                    return Some(j);
454                }
455            }
456            AnyDelim::Dollar2 => {
457                if bytes.get(j).copied() == Some(b'$') && bytes.get(j.saturating_add(1)).copied() == Some(b'$') {
458                    return Some(j);
459                }
460            }
461            AnyDelim::Dollar => {
462                if bytes.get(j).copied() == Some(b'$') {
463                    return Some(j);
464                }
465            }
466        }
467        j = j.saturating_add(1);
468    }
469    None
470}
471
472const fn close_target_byte(delim: AnyDelim) -> u8 {
473    match delim {
474        AnyDelim::Bracket => b']',
475        AnyDelim::Paren => b')',
476        AnyDelim::Dollar2 | AnyDelim::Dollar => b'$',
477    }
478}
479
480#[cfg(test)]
481#[allow(clippy::indexing_slicing, clippy::panic)]
482mod tests {
483    use std::borrow::Cow;
484
485    use super::*;
486
487    fn scan(source: &str) -> (Vec<MathRegion>, Vec<MathError>) {
488        scan_math_regions(source, &[], &[], MathConfig::default())
489    }
490
491    fn scan_with_runs(
492        source: &str,
493        transparent_runs: &[Range<usize>],
494        cfg: MathConfig,
495    ) -> (Vec<MathRegion>, Vec<MathError>) {
496        scan_math_regions(source, &[], transparent_runs, cfg)
497    }
498
499    fn regions(source: &str) -> Vec<MathRegion> {
500        scan(source).0
501    }
502
503    #[test]
504    fn display_math_single_line() {
505        let s = r"prefix \[ A \] suffix";
506        let regs = regions(s);
507        assert_eq!(regs.len(), 1);
508        assert_eq!(&s[regs[0].range.clone()], r"\[ A \]");
509        assert!(matches!(
510            regs[0].span(),
511            MathSpan::Display {
512                delim: DisplayDelim::Bracket,
513                ..
514            }
515        ));
516    }
517
518    #[test]
519    fn display_math_multi_line() {
520        let s = "before \\[\n  A \\to B\n\\] after";
521        let regs = regions(s);
522        assert_eq!(regs.len(), 1);
523        let span = &s[regs[0].range.clone()];
524        assert!(span.starts_with(r"\["));
525        assert!(span.ends_with(r"\]"));
526    }
527
528    #[test]
529    fn inline_math_paren() {
530        let s = r"x is \( a + b \) units";
531        let regs = regions(s);
532        assert_eq!(regs.len(), 1);
533        assert_eq!(&s[regs[0].range.clone()], r"\( a + b \)");
534        assert!(matches!(
535            regs[0].span(),
536            MathSpan::Inline {
537                delim: InlineDelim::Paren,
538                ..
539            }
540        ));
541    }
542
543    #[test]
544    fn two_separate_regions() {
545        let s = r"see \[ A \] and \[ B \] both";
546        let regs = regions(s);
547        assert_eq!(regs.len(), 2);
548        assert!(regs[0].range.end <= regs[1].range.start);
549    }
550
551    #[test]
552    fn unbalanced_open_drops_region_and_emits_error() {
553        let s = r"start \[ no close here";
554        let (regs, errs) = scan(s);
555        assert!(regs.is_empty());
556        assert_eq!(errs.len(), 1);
557        match &errs[0] {
558            MathError::UnbalancedDelim { delim, .. } => {
559                assert!(delim.is_display());
560                assert_eq!(delim.open(), r"\[");
561                assert_eq!(delim.close(), r"\]");
562            }
563            MathError::UnbalancedEnv { .. } | MathError::UnbalancedBraces { .. } => {
564                panic!("expected delim error")
565            }
566        }
567    }
568
569    #[test]
570    fn greedy_first_close() {
571        let s = r"\[ a \[ b \] c \]";
572        let regs = regions(s);
573        assert_eq!(regs.len(), 1);
574        assert_eq!(&s[regs[0].range.clone()], r"\[ a \[ b \]");
575    }
576
577    #[test]
578    fn double_backslash_open_is_not_math() {
579        let s = r"foo \\[ not math \] bar";
580        assert!(regions(s).is_empty());
581    }
582
583    #[test]
584    fn triple_backslash_open_is_math() {
585        let s = r"foo \\\[ A \] bar";
586        assert_eq!(regions(s).len(), 1);
587    }
588
589    #[test]
590    #[allow(
591        clippy::single_range_in_vec_init,
592        reason = "test intentionally passes one exclusion range"
593    )]
594    fn region_inside_code_span_excluded() {
595        let s = r"text `\[ x \]` more";
596        let exclusions = [5..14];
597        let (regs, _) = scan_math_regions(s, &exclusions, &[], MathConfig::default());
598        assert!(regs.is_empty());
599    }
600
601    #[test]
602    #[allow(
603        clippy::single_range_in_vec_init,
604        reason = "test intentionally passes one exclusion range"
605    )]
606    fn region_inside_code_block_excluded() {
607        let s = "```\n\\[ x \\]\n```";
608        let exclusions = [0..s.len()];
609        let (regs, _) = scan_math_regions(s, &exclusions, &[], MathConfig::default());
610        assert!(regs.is_empty());
611    }
612
613    #[test]
614    #[allow(
615        clippy::single_range_in_vec_init,
616        reason = "test intentionally passes one exclusion range"
617    )]
618    fn region_inside_inline_html_excluded() {
619        let s = r#"see <a href="/x?val=$foo">x</a> after"#;
620        let exclusions = [4..26];
621        let cfg = MathConfig {
622            single_dollar: true,
623            ..MathConfig::default()
624        };
625        let (regs, _) = scan_math_regions(s, &exclusions, &[], cfg);
626        assert!(regs.is_empty());
627    }
628
629    #[test]
630    fn dollar_variants_off_by_default() {
631        let s = "value is $5 today, plus $$2 tomorrow";
632        assert!(regions(s).is_empty());
633    }
634
635    #[test]
636    fn double_dollar_when_enabled() {
637        let s = "see $$ x = 5 $$ above";
638        let cfg = MathConfig {
639            double_dollar: true,
640            ..MathConfig::default()
641        };
642        let (regs, _) = scan_math_regions(s, &[], &[], cfg);
643        assert_eq!(regs.len(), 1);
644        assert_eq!(&s[regs[0].range.clone()], "$$ x = 5 $$");
645        assert!(matches!(
646            regs[0].span(),
647            MathSpan::Display {
648                delim: DisplayDelim::Dollar2,
649                ..
650            }
651        ));
652    }
653
654    #[test]
655    fn single_dollar_when_enabled() {
656        let s = "x is $a + b$";
657        let cfg = MathConfig {
658            single_dollar: true,
659            ..MathConfig::default()
660        };
661        let (regs, _) = scan_math_regions(s, &[], &[], cfg);
662        assert_eq!(regs.len(), 1);
663        assert_eq!(&s[regs[0].range.clone()], "$a + b$");
664        assert!(matches!(
665            regs[0].span(),
666            MathSpan::Inline {
667                delim: InlineDelim::Dollar,
668                ..
669            }
670        ));
671    }
672
673    #[test]
674    fn region_with_subscripts_and_emphasis_chars() {
675        let s = r"see \[ \pi_A:\Gamma.A\to \Gamma \] above";
676        let regs = regions(s);
677        assert_eq!(regs.len(), 1);
678        let span = &s[regs[0].range.clone()];
679        assert!(span.contains("_A"));
680        assert!(span.contains(r"\Gamma"));
681    }
682
683    #[test]
684    fn regions_dont_overlap_or_misorder() {
685        let s = r"\[ a \] mid \( b \) end \[ c \]";
686        let regs = regions(s);
687        assert_eq!(regs.len(), 3);
688        for w in regs.windows(2) {
689            assert!(w[0].range.end <= w[1].range.start);
690        }
691    }
692
693    #[test]
694    fn environment_basic() {
695        let s = "before \\begin{align} x &= y \\end{align} after";
696        let regs = regions(s);
697        assert_eq!(regs.len(), 1);
698        let span = &s[regs[0].range.clone()];
699        assert!(span.starts_with("\\begin{align}"));
700        assert!(span.ends_with("\\end{align}"));
701        match regs[0].span() {
702            MathSpan::Environment { env, body } => {
703                assert!(matches!(env, EnvKind::Known(KnownEnv::Align)));
704                assert!(body.as_str(s).contains("x &= y"));
705            }
706            MathSpan::Inline { .. } | MathSpan::Display { .. } => {
707                panic!("expected environment span")
708            }
709        }
710    }
711
712    #[test]
713    fn environment_nested_same_name() {
714        let s = "\\begin{matrix} a \\begin{matrix} b \\end{matrix} c \\end{matrix}";
715        let regs = regions(s);
716        assert_eq!(regs.len(), 1);
717        assert_eq!(&s[regs[0].range.clone()], s);
718    }
719
720    #[test]
721    fn environment_starred_name() {
722        let s = "\\begin{align*} x \\end{align*}";
723        let regs = regions(s);
724        assert_eq!(regs.len(), 1);
725        assert!(matches!(
726            regs[0].span(),
727            MathSpan::Environment {
728                env: EnvKind::Known(KnownEnv::AlignStar),
729                ..
730            }
731        ));
732    }
733
734    #[test]
735    fn environment_custom_name_round_trips() {
736        let s = "\\begin{widget} q \\end{widget}";
737        let regs = regions(s);
738        assert_eq!(regs.len(), 1);
739        match regs[0].span() {
740            MathSpan::Environment {
741                env: EnvKind::Custom(name_range),
742                ..
743            } => {
744                assert_eq!(&s[name_range.clone()], "widget");
745            }
746            MathSpan::Inline { .. }
747            | MathSpan::Display { .. }
748            | MathSpan::Environment {
749                env: EnvKind::Known(_), ..
750            } => {
751                panic!("expected custom env")
752            }
753        }
754    }
755
756    #[test]
757    fn environment_unbalanced_emits_error() {
758        let s = "\\begin{align} x = 1 \n";
759        let (regs, errs) = scan(s);
760        assert!(regs.is_empty());
761        assert_eq!(errs.len(), 1);
762        assert!(matches!(&errs[0], MathError::UnbalancedEnv { name, .. } if name == "align"));
763    }
764
765    #[test]
766    fn environment_inside_display_is_one_region() {
767        let s = "\\[ \\begin{aligned} a &= b \\end{aligned} \\]";
768        let regs = regions(s);
769        assert_eq!(regs.len(), 1);
770        // The outer region is Display (brackets); the inner aligned
771        // is part of the body, not its own top-level region.
772        assert!(matches!(
773            regs[0].span(),
774            MathSpan::Display {
775                delim: DisplayDelim::Bracket,
776                ..
777            }
778        ));
779    }
780
781    #[test]
782    fn brace_imbalance_emits_error_but_region_still_scans() {
783        let s = r"\[ \frac{a}{b \]";
784        let (regs, errs) = scan(s);
785        assert_eq!(regs.len(), 1);
786        assert!(errs.iter().any(|e| matches!(e, MathError::UnbalancedBraces { .. })));
787    }
788
789    #[test]
790    fn brace_balance_with_escaped_braces() {
791        let s = r"\[ \{ a \} \]";
792        let (_, errs) = scan(s);
793        assert!(
794            errs.iter().all(|e| !matches!(e, MathError::UnbalancedBraces { .. })),
795            "escaped braces should not count: {errs:?}"
796        );
797    }
798
799    #[test]
800    fn transparent_run_in_blockquote_strips_prefix() {
801        // `> $$\n> x = 1\n> $$` — the `> ` on lines 2 and 3 sits
802        // inside the math body and must be stripped from the clean
803        // content.
804        let s = "> $$\n> x = 1\n> $$";
805        // The `> ` prefix sits on each line. The math open is at
806        // byte 2 (`$$`). The body runs from byte 4 (after `$$`) to
807        // byte 15 (before the close `$$`). Transparent runs cover
808        // the `> ` prefixes on lines 2 and 3.
809        let runs = vec![5..7, 13..15];
810        let cfg = MathConfig {
811            double_dollar: true,
812            ..MathConfig::default()
813        };
814        let (regs, _) = scan_with_runs(s, &runs, cfg);
815        assert_eq!(regs.len(), 1, "expected one region in {s:?}");
816        let body = regs[0].span().body();
817        let clean = body.as_str(s);
818        assert!(
819            matches!(&clean, Cow::Owned(_)),
820            "expected owned body for container-nested math, got {clean:?}",
821        );
822        assert!(!clean.contains('>'), "container prefix leaked: {clean:?}");
823        assert!(clean.contains("x = 1"), "body lost content: {clean:?}");
824    }
825
826    #[test]
827    fn transparent_run_in_list_item_strips_indent() {
828        // `1. item\n   $$\n   x = 1\n   $$` — the 3-space
829        // continuation indent on lines 2-4 must be stripped from the
830        // clean content.
831        let s = "1. item\n   $$\n   x = 1\n   $$";
832        // Continuation indents at line 2 (bytes 8..11), line 3
833        // (14..17), line 4 (23..26).
834        let runs = vec![8..11, 14..17, 23..26];
835        let cfg = MathConfig {
836            double_dollar: true,
837            ..MathConfig::default()
838        };
839        let (regs, _) = scan_with_runs(s, &runs, cfg);
840        assert_eq!(regs.len(), 1);
841        let clean = regs[0].span().body().as_str(s);
842        assert!(matches!(&clean, Cow::Owned(_)));
843        assert!(!clean.contains("   "), "indent leaked: {clean:?}");
844        assert!(clean.contains("x = 1"));
845    }
846
847    #[test]
848    fn nested_blockquote_combined_prefix() {
849        // `> > $$\n> > x\n> > $$` — both nesting levels' `> `
850        // prefixes are combined into one transparent run per line.
851        let s = "> > $$\n> > x\n> > $$";
852        // Line 2 (`> > x`): bytes 7..11 → "> > " (4 bytes).
853        // Line 3 (`> > $$`): bytes 13..17 → "> > " (4 bytes).
854        let runs = vec![7..11, 13..17];
855        let cfg = MathConfig {
856            double_dollar: true,
857            ..MathConfig::default()
858        };
859        let (regs, _) = scan_with_runs(s, &runs, cfg);
860        assert_eq!(regs.len(), 1);
861        let clean = regs[0].span().body().as_str(s);
862        assert!(!clean.contains('>'), "prefix leaked: {clean:?}");
863        assert!(clean.contains('x'));
864    }
865
866    #[test]
867    fn top_level_math_borrows() {
868        // `$$\nx\n$$` at root with empty transparent runs: the body
869        // is borrowed from source, no allocation. Test the
870        // `Cow::Borrowed` discriminant explicitly so the fast path
871        // can't regress silently.
872        let s = "$$\nx\n$$";
873        let cfg = MathConfig {
874            double_dollar: true,
875            ..MathConfig::default()
876        };
877        let (regs, _) = scan_with_runs(s, &[], cfg);
878        assert_eq!(regs.len(), 1);
879        let clean = regs[0].span().body().as_str(s);
880        assert!(
881            matches!(clean, Cow::Borrowed(_)),
882            "expected borrowed body for top-level math",
883        );
884    }
885
886    #[test]
887    fn body_source_ranges_can_drive_latex_translation_without_markdown_parsing() {
888        let s = r"Inline \( \alpha_i \) and \[ x^{2} \].";
889        let regs = regions(s);
890        let ranges = regs
891            .iter()
892            .map(|region| region.span().body().source_range())
893            .collect::<Vec<_>>();
894
895        let translated = mdwright_latex::translate_latex_ranges_to_unicode(s, &ranges);
896
897        assert_eq!(translated.text(), r"Inline \( αᵢ \) and \[ x² \].");
898        assert_eq!(translated.edit_count(), 2);
899        assert!(translated.is_lossless());
900    }
901
902    #[test]
903    fn transparent_run_protects_delim_match() {
904        // `> $$ x\n> $$` — the close `$$` on line 2 is outside any
905        // transparent run; the region must still be recognised
906        // end-to-end with the body crossing the `\n> ` prefix.
907        let s = "> $$ x\n> $$";
908        let run = 7..9;
909        let runs = std::slice::from_ref(&run);
910        let cfg = MathConfig {
911            double_dollar: true,
912            ..MathConfig::default()
913        };
914        let (regs, _) = scan_with_runs(s, runs, cfg);
915        assert_eq!(regs.len(), 1, "expected one region in {s:?}");
916        // Close $$ is at bytes 9..11.
917        assert_eq!(regs[0].range.end, 11);
918    }
919
920    #[test]
921    fn transparent_run_blocks_spurious_delim() {
922        // `not math\n> $\n` with single-dollar enabled and the `> `
923        // recorded as a transparent run on line 2. The `$` after
924        // the prefix is at the end of the line and never sees a
925        // close — so the scanner records an UnbalancedDelim, not
926        // a recognised region. The point of the test: the lexer
927        // does NOT see a `$` inside the transparent prefix bytes
928        // themselves (no spurious region anchored at `>`).
929        let s = "not math\n> $\n";
930        let run = 9..11;
931        let runs = std::slice::from_ref(&run);
932        let cfg = MathConfig {
933            single_dollar: true,
934            ..MathConfig::default()
935        };
936        let (regs, errs) = scan_with_runs(s, runs, cfg);
937        assert!(regs.is_empty(), "no region should match in {s:?}");
938        assert!(
939            errs.iter().any(|e| matches!(e, MathError::UnbalancedDelim { .. })),
940            "expected an UnbalancedDelim for the unclosed `$`: {errs:?}",
941        );
942    }
943}