rumdl_lib/utils/
pandoc.rs

1//! Pandoc Markdown syntax detection.
2//!
3//! This module provides detection for Pandoc Markdown constructs that affect
4//! rumdl rule output: fenced divs (`:::`), attribute lists (`{#id .class}`),
5//! citations (`[@key]`), bracketed spans (`[text]{.class}`), and other
6//! Pandoc-specific syntax.
7//!
8//! Pandoc is the foundation; the Quarto flavor extends it with Quarto-only
9//! syntax (executable code blocks, shortcodes, cell options) elsewhere in
10//! the codebase. Anything that's pure Pandoc lives here.
11//!
12//! Common patterns this module handles:
13//! - `::: {.callout-note}` — fenced div with class
14//! - `::: {#myid .class}` — generic div with id and class
15//! - `:::` — closing marker
16//! - `{#id .class key="value"}` — Pandoc attribute lists
17//! - `@key`, `[@key]`, `[-@key]`, `[@a; @b]` — citations
18
19use regex::Regex;
20use std::sync::LazyLock;
21
22use crate::utils::skip_context::ByteRange;
23
24/// Pattern to match div opening markers
25/// Matches: ::: {.class}, ::: {#id .class}, ::: classname, etc.
26/// Does NOT match a closing ::: on its own
27static DIV_OPEN_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*):::\s*(?:\{[^}]+\}|\S+)").unwrap());
28
29/// Pattern to match div closing markers
30/// Matches: ::: (with optional whitespace before and after)
31static DIV_CLOSE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*):::\s*$").unwrap());
32
33/// Pattern to match callout blocks specifically
34/// Callout types: note, warning, tip, important, caution
35static CALLOUT_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
36    Regex::new(r"^(\s*):::\s*\{[^}]*\.callout-(?:note|warning|tip|important|caution)[^}]*\}").unwrap()
37});
38
39/// Pattern to match Pandoc-style attributes on any element
40/// Matches: {#id}, {.class}, {#id .class key="value"}, etc.
41/// Note: We match the entire attribute block including contents
42static PANDOC_ATTR_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\{[^}]+\}").unwrap());
43
44/// Check if a line is a div opening marker
45pub fn is_div_open(line: &str) -> bool {
46    DIV_OPEN_PATTERN.is_match(line)
47}
48
49/// Check if a line is a div closing marker (just `:::`)
50pub fn is_div_close(line: &str) -> bool {
51    DIV_CLOSE_PATTERN.is_match(line)
52}
53
54/// Check if a line is a callout block opening
55pub fn is_callout_open(line: &str) -> bool {
56    CALLOUT_PATTERN.is_match(line)
57}
58
59/// Check if a line contains Pandoc-style attributes
60pub fn has_pandoc_attributes(line: &str) -> bool {
61    PANDOC_ATTR_PATTERN.is_match(line)
62}
63
/// Return true if `lang` is a Pandoc raw-format declaration such as
/// `{=html}` or `{=latex}`.
///
/// Surrounding whitespace on `lang` is ignored. The text between `{=` and the
/// final `}` is the format name; it must be non-empty and made up solely of
/// ASCII alphanumerics, `_`, or `-`.
pub fn is_pandoc_raw_block_lang(lang: &str) -> bool {
    lang.trim()
        .strip_prefix("{=")
        .and_then(|rest| rest.strip_suffix('}'))
        .is_some_and(|format| {
            // `all` is vacuously true on an empty name, so reject that first.
            !format.trim().is_empty()
                && format
                    .chars()
                    .all(|ch| matches!(ch, '_' | '-') || ch.is_ascii_alphanumeric())
        })
}
74
/// Return true if `lang` is a Pandoc code-attribute language declaration: a
/// brace-delimited attribute list holding at least one `.class` token, e.g.
/// `{.python}`, `{.haskell .numberLines}`, `{#snippet .python startFrom="10"}`.
///
/// Surrounding whitespace on `lang` is ignored. The text between the braces
/// is split on whitespace; a token counts as a class when it is a `.`
/// followed by a non-empty run of ASCII alphanumerics, `_`, or `-`.
pub fn is_pandoc_code_class_attr(lang: &str) -> bool {
    /// True when `token` is `.` plus a non-empty identifier.
    fn is_class_token(token: &str) -> bool {
        match token.strip_prefix('.') {
            Some(name) if !name.is_empty() => name
                .chars()
                .all(|ch| matches!(ch, '_' | '-') || ch.is_ascii_alphanumeric()),
            _ => false,
        }
    }

    lang.trim()
        .strip_prefix('{')
        .and_then(|rest| rest.strip_suffix('}'))
        .is_some_and(|attrs| attrs.split_whitespace().any(is_class_token))
}
96
/// Measure the leading indentation of `line` in columns.
///
/// Only the leading run of spaces and tabs is counted: each space is one
/// column and each tab is a flat four columns. Counting stops at the first
/// character that is neither.
pub fn get_div_indent(line: &str) -> usize {
    line.chars()
        .take_while(|ch| matches!(ch, ' ' | '\t'))
        .map(|ch| if ch == '\t' { 4 } else { 1 })
        .sum()
}
109
110/// Track div nesting state for a document
111#[derive(Debug, Clone, Default)]
112pub struct DivTracker {
113    /// Stack of div indentation levels for nesting tracking
114    indent_stack: Vec<usize>,
115}
116
117impl DivTracker {
118    pub fn new() -> Self {
119        Self::default()
120    }
121
122    /// Process a line and return whether we're inside a div after processing
123    pub fn process_line(&mut self, line: &str) -> bool {
124        let trimmed = line.trim_start();
125
126        if trimmed.starts_with(":::") {
127            let indent = get_div_indent(line);
128
129            if is_div_close(line) {
130                // Closing marker - pop the matching div from stack
131                // Pop the top div if its indent is >= the closing marker's indent
132                if let Some(&top_indent) = self.indent_stack.last()
133                    && top_indent >= indent
134                {
135                    self.indent_stack.pop();
136                }
137            } else if is_div_open(line) {
138                // Opening marker - push to stack
139                self.indent_stack.push(indent);
140            }
141        }
142
143        !self.indent_stack.is_empty()
144    }
145
146    /// Check if we're currently inside a div
147    pub fn is_inside_div(&self) -> bool {
148        !self.indent_stack.is_empty()
149    }
150}
151
152/// Detect fenced div block ranges in content.
153/// Returns a vector of byte ranges (start, end) for each div block.
154pub fn detect_div_block_ranges(content: &str) -> Vec<ByteRange> {
155    let mut ranges = Vec::new();
156    let mut tracker = DivTracker::new();
157    let mut div_start: Option<usize> = None;
158    let mut byte_offset = 0;
159
160    for line in content.lines() {
161        let line_len = line.len();
162        let was_inside = tracker.is_inside_div();
163        let is_inside = tracker.process_line(line);
164
165        // Started a new div block
166        if !was_inside && is_inside {
167            div_start = Some(byte_offset);
168        }
169        // Exited a div block
170        else if was_inside
171            && !is_inside
172            && let Some(start) = div_start.take()
173        {
174            // End at the start of the closing line
175            ranges.push(ByteRange {
176                start,
177                end: byte_offset + line_len,
178            });
179        }
180
181        // Account for newline
182        byte_offset += line_len + 1;
183    }
184
185    // Handle unclosed divs at end of document
186    if let Some(start) = div_start {
187        ranges.push(ByteRange {
188            start,
189            end: content.len(),
190        });
191    }
192
193    ranges
194}
195
196/// Check if a byte position is within a div block
197pub fn is_within_div_block_ranges(ranges: &[ByteRange], position: usize) -> bool {
198    ranges.iter().any(|r| position >= r.start && position < r.end)
199}
200
201// ============================================================================
202// Citation Support
203// ============================================================================
204//
205// Pandoc citation syntax:
206// - Inline citation: @smith2020
207// - Parenthetical citation: [@smith2020]
208// - Suppress author: [-@smith2020]
209// - With locator: [@smith2020, p. 10]
210// - Multiple citations: [@smith2020; @jones2021]
211// - With prefix: [see @smith2020]
212//
// Citation keys must start with a letter, digit, or underscore, and may contain
// alphanumerics, underscores, and the internal punctuation characters accepted
// by the patterns below: `:.#$%&-+?<>~/`.
215
216/// Pattern to match bracketed citations: [@key], [-@key], [see @key], [@a; @b]
217///
218/// The `@` must sit at a citation boundary: immediately after `[`, or after a
219/// non-word character such as whitespace, `-`, `;`, or `,`. This excludes
220/// word-embedded `@` (e.g. emails or handles in link text like
221/// `[contact user@example.com](url)`), which are not citations.
222static BRACKETED_CITATION_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
223    Regex::new(r"\[(?:[^\]@]*[^A-Za-z0-9_])?@[a-zA-Z0-9_][a-zA-Z0-9_:.#$%&\-+?<>~/]*[^\]]*\]").unwrap()
224});
225
226/// Pattern to match inline citations: @key (not inside brackets)
227/// Citation key: starts with letter/digit/underscore, contains alphanumerics and some punctuation
228/// The @ must be preceded by whitespace, start of line, or punctuation (not alphanumeric)
229static INLINE_CITATION_PATTERN: LazyLock<Regex> = LazyLock::new(|| {
230    // Match @ at start of string, after whitespace, or after non-alphanumeric (except @[)
231    Regex::new(r"(?:^|[\s\(\[\{,;:])(@[a-zA-Z0-9_][a-zA-Z0-9_:.#$%&\-+?<>~/]*)").unwrap()
232});
233
234/// Pattern to match the bracketed text portion of a Markdown link.
235///
236/// Matches `[...]` that is *immediately* followed by `(` (inline link) or
237/// `[` (reference link). Capture group 1 is the bracket span, including the
238/// surrounding `[` and `]`. Used by citation detection to exclude `@key`
239/// occurrences appearing inside link labels.
240static LINK_LABEL_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(\[[^\]]*\])(?:\(|\[)").unwrap());
241
/// Cheap pre-filter for citations: true when `text` contains an `@` anywhere.
///
/// A `true` result only means citations are possible; a `false` result means
/// there cannot be any.
#[inline]
pub fn has_citations(text: &str) -> bool {
    // `@` is ASCII, so a byte scan is equivalent to a char search in UTF-8.
    text.as_bytes().contains(&b'@')
}
247
248// ============================================================================
249// Inline Footnote Support
250// ============================================================================
251//
252// Pandoc inline footnote syntax: ^[footnote text]
253//
254// The `^` must not be preceded by `!` (image) or by a word character
255// (superscript syntax: `2^10^`). The footnote body extends to the first
256// unescaped `]`; nested brackets are not supported in this detector.
257
258/// Pattern for Pandoc inline footnotes: `^[note text]`.
259/// The `^` must not be preceded by `!` (which would be an image) or by
260/// alphanumeric (which would be a superscript: `2^10^`).
261static INLINE_FOOTNOTE_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?:^|[^\w!])(\^\[[^\]]*\])").unwrap());
262
/// Compute the Pandoc-style slug for a heading text.
///
/// Pandoc's `auto_identifiers` extension:
/// 1. Remove all formatting, links, etc.
/// 2. Remove all footnotes.
/// 3. Remove all non-alphanumeric characters except `_`, `-`, `.`.
/// 4. Replace all spaces and newlines with `-`.
/// 5. Lowercase letters.
/// 6. Remove everything up to the first letter (identifiers may not begin
///    with a digit or punctuation mark).
/// 7. If nothing remains, use `section`.
///
/// This function performs steps 3–7 on `text` as given; it does no Markdown
/// parsing of its own. Whitespace runs become a single `-`, while literal
/// hyphens in the heading are kept as written, so `"Before - after"` yields
/// `"before---after"`.
pub fn pandoc_header_slug(text: &str) -> String {
    // Keep only the permitted characters (plus whitespace, which still has to
    // act as a word separator) and lowercase them.
    let filtered: String = text
        .chars()
        .filter(|&c| c.is_alphanumeric() || c.is_whitespace() || matches!(c, '_' | '-' | '.'))
        .flat_map(char::to_lowercase)
        .collect();

    // Splitting on whitespace and re-joining drops leading/trailing whitespace
    // and collapses runs without touching literal hyphens.
    let joined = filtered.split_whitespace().collect::<Vec<_>>().join("-");

    // Identifiers may not start with a digit or punctuation mark.
    let slug = joined.trim_start_matches(|c: char| !c.is_alphabetic());
    if slug.is_empty() {
        "section".to_string()
    } else {
        slug.to_string()
    }
}
294
295/// Find headings in the document and return a set of their Pandoc slugs.
296///
297/// Scans ATX-style headings (lines beginning with one or more `#`) and computes
298/// a slug for each using [`pandoc_header_slug`]. The resulting set is used by
299/// the `implicit_header_references` extension detector in [`LintContext`].
300///
301/// Pandoc's `auto_identifiers` extension disambiguates duplicate headings by
302/// appending `-1`, `-2`, etc. to the second, third, … occurrence of the same
303/// base slug. Both the base slug and its suffixed forms are inserted so that
304/// links such as `#section` and `#section-1` both resolve.
305///
306/// Lines inside fenced code blocks (delimited by ` ``` ` or `~~~`, >= 3 chars)
307/// are skipped so that bash comments and shebang lines are not mistaken for
308/// headings.
309pub fn collect_pandoc_header_slugs(content: &str) -> std::collections::HashSet<String> {
310    use std::collections::{HashMap, HashSet};
311    let mut slugs = HashSet::new();
312    let mut base_counts: HashMap<String, usize> = HashMap::new();
313    let mut in_fence = false;
314    let mut fence_marker: Option<char> = None;
315    for line in content.lines() {
316        let trimmed = line.trim_start();
317        // Detect fenced code block open/close. Pandoc fences are >= 3 backticks
318        // or >= 3 tildes at the start of a line (after optional indentation).
319        // A closing fence must use the same marker character as the opening one.
320        if let Some(c) = trimmed.chars().next()
321            && (c == '`' || c == '~')
322        {
323            let count = trimmed.chars().take_while(|&ch| ch == c).count();
324            if count >= 3 {
325                match fence_marker {
326                    None => {
327                        in_fence = true;
328                        fence_marker = Some(c);
329                    }
330                    Some(m) if m == c => {
331                        in_fence = false;
332                        fence_marker = None;
333                    }
334                    _ => {}
335                }
336                continue;
337            }
338        }
339        if in_fence {
340            continue;
341        }
342        if let Some(rest) = trimmed.strip_prefix('#') {
343            let mut text = rest.trim_start_matches('#').trim();
344            // Strip trailing `{#id .class}` attribute block only when the `{...}`
345            // extends to the end of the text (possibly followed by whitespace).
346            // This prevents `{` appearing inside heading body text (e.g.
347            // `# Some {curly} word`) from being mistaken for an attribute block.
348            if let Some(idx) = text.rfind(" {")
349                && let Some(close_rel) = text[idx + 2..].find('}')
350                && text[idx + 2 + close_rel + 1..].trim().is_empty()
351            {
352                text = &text[..idx];
353            }
354            let base = pandoc_header_slug(text);
355            let count = base_counts.entry(base.clone()).or_insert(0);
356            let slug = if *count == 0 {
357                base.clone()
358            } else {
359                format!("{base}-{count}")
360            };
361            *count += 1;
362            slugs.insert(slug);
363        }
364    }
365    slugs
366}
367
368// ============================================================================
369// Subscript and Superscript Support
370// ============================================================================
371//
372// Pandoc `subscript` extension: `~x~` where x contains no whitespace or `~`.
373// Pandoc `superscript` extension: `^x^` where x contains no whitespace or `^`.
374//
375// These are distinct from GFM strikethrough (`~~text~~`) and Pandoc inline
376// footnotes (`^[...]`). The disambiguation rule for subscript is: reject any
377// match where the opening or closing `~` is immediately adjacent to another `~`
378// (which would make it GFM strikethrough). For superscript, reject matches
379// where a `^` neighbour would form `^^`.
380
381/// Pattern for Pandoc subscript: `~x~` where x is non-whitespace, non-`~`.
382static SUBSCRIPT_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"~[^\s~]+~").unwrap());
383
384/// Pattern for Pandoc superscript: `^x^` where x is non-whitespace, non-`^`.
385static SUPERSCRIPT_PATTERN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\^[^\s^]+\^").unwrap());
386
387/// Detect Pandoc subscript (`~x~`) and superscript (`^x^`) ranges.
388///
389/// Returns byte ranges covering the full delimited span (including the
390/// delimiter characters). Excludes `~~strikethrough~~` and superscript-like
391/// runs of `^^`. The returned ranges are sorted by `start`.
392///
393/// Note: a `^[…]^` construct will also match `detect_inline_footnote_ranges`.
394/// Rules that distinguish footnotes from superscripts must check both accessors.
395pub fn detect_subscript_superscript_ranges(content: &str) -> Vec<ByteRange> {
396    let bytes = content.as_bytes();
397    let mut ranges = Vec::new();
398
399    for m in SUBSCRIPT_PATTERN.find_iter(content) {
400        // Reject if preceded or followed by `~` (would be strikethrough).
401        let prev = m.start().checked_sub(1).map_or(0, |i| bytes[i]);
402        let next = bytes.get(m.end()).copied().unwrap_or(0);
403        if prev != b'~' && next != b'~' {
404            ranges.push(ByteRange {
405                start: m.start(),
406                end: m.end(),
407            });
408        }
409    }
410    for m in SUPERSCRIPT_PATTERN.find_iter(content) {
411        // Reject if preceded or followed by `^` (would be a `^^` run).
412        let prev = m.start().checked_sub(1).map_or(0, |i| bytes[i]);
413        let next = bytes.get(m.end()).copied().unwrap_or(0);
414        if prev != b'^' && next != b'^' {
415            ranges.push(ByteRange {
416                start: m.start(),
417                end: m.end(),
418            });
419        }
420    }
421    // Sort because the two regex passes are merged and their results may interleave.
422    ranges.sort_by_key(|r| r.start);
423    ranges
424}
425
426// ============================================================================
427// Inline Code Attribute Support
428// ============================================================================
429//
430// Pandoc `inline_code_attributes` extension: `` `code`{.lang} ``
431//
432// The attribute block must immediately follow the closing backtick of the
433// inline code span. Only the `{...}` part is captured; the backtick span
434// itself is already handled by the standard code-span detector.
435
436/// Pattern for inline code attribute: a backtick-quoted span immediately
437/// followed by `{...}`. We capture only the trailing attribute block.
438static INLINE_CODE_ATTR: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"`[^`]*`(\{[^}]+\})").unwrap());
439
440/// Detect Pandoc inline code attribute ranges.
441///
442/// Inline code attributes are written as `` `code`{.lang} ``. Returns the
443/// byte ranges of the trailing `{...}` attribute block only (not the
444/// backticked code itself).
445pub fn detect_inline_code_attr_ranges(content: &str) -> Vec<ByteRange> {
446    let mut ranges = Vec::new();
447    for caps in INLINE_CODE_ATTR.captures_iter(content) {
448        let m = caps.get(1).unwrap();
449        ranges.push(ByteRange {
450            start: m.start(),
451            end: m.end(),
452        });
453    }
454    ranges
455}
456
457// ============================================================================
458// Example List Support
459// ============================================================================
460//
461// Pandoc `example_lists` extension:
462// - Line-start marker: `(@)` or `(@label)` followed by whitespace
463// - Inline reference: `(@label)` appearing mid-paragraph (not at line start)
464//
465// Example keys contain letters, digits, underscores, and hyphens.
466// The anonymous form `(@)` is valid as a marker but cannot appear as a reference
467// (references require a label to be named).
468
469/// Pattern for an example-list marker at line start: `(@)` or `(@label)` followed
470/// by whitespace. Captures the `(@...)` portion.
471static EXAMPLE_LIST_MARKER: LazyLock<Regex> =
472    LazyLock::new(|| Regex::new(r"(?m)^[ \t]*(\(@[A-Za-z0-9_-]*\))[ \t]+").unwrap());
473
474/// Pattern for an example reference: `(@label)` anywhere in text. Used together
475/// with the marker pre-pass to filter out line-start markers.
476static EXAMPLE_REFERENCE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(\(@[A-Za-z0-9_-]+\))").unwrap());
477
478/// Detect Pandoc example-list marker ranges (`(@)` / `(@label)` at line start).
479///
480/// Returns byte ranges covering the `(@...)` portion of each marker. Used by
481/// rules that process list markers to skip Pandoc example markers.
482pub fn detect_example_list_marker_ranges(content: &str) -> Vec<ByteRange> {
483    let mut ranges = Vec::new();
484    for caps in EXAMPLE_LIST_MARKER.captures_iter(content) {
485        let m = caps.get(1).unwrap();
486        ranges.push(ByteRange {
487            start: m.start(),
488            end: m.end(),
489        });
490    }
491    ranges
492}
493
494/// Detect Pandoc example reference ranges (`(@label)` not at line start).
495///
496/// Excludes positions whose start byte appears in `marker_ranges` (those are
497/// line-start markers, not references). The caller must pass the already-computed
498/// result of [`detect_example_list_marker_ranges`] so the marker regex is not
499/// executed a second time.
500pub fn detect_example_reference_ranges(content: &str, marker_ranges: &[ByteRange]) -> Vec<ByteRange> {
501    let mut ranges = Vec::new();
502    let marker_starts: std::collections::HashSet<usize> = marker_ranges.iter().map(|r| r.start).collect();
503    for caps in EXAMPLE_REFERENCE.captures_iter(content) {
504        let m = caps.get(1).unwrap();
505        if !marker_starts.contains(&m.start()) {
506            ranges.push(ByteRange {
507                start: m.start(),
508                end: m.end(),
509            });
510        }
511    }
512    ranges
513}
514
515// ============================================================================
516// Bracketed Span Support
517// ============================================================================
518//
519// Pandoc `bracketed_spans` extension: `[text]{attrs}` where attrs is a
520// non-empty Pandoc attribute block.
521//
522// Distinguished from `[text](url)` (link) and `[text][ref]` (reference link)
523// by requiring `]{` immediately adjacent — the `{` must directly follow `]`
524// with no intervening characters.
525
526/// Pattern for Pandoc bracketed span: `[text]{attrs}` where attrs is a
527/// non-empty Pandoc attribute block. The regex requires `]{` immediately
528/// adjacent (no characters between `]` and `{`), which excludes `[text](url)`
529/// links and `[text][ref]` reference links.
530static BRACKETED_SPAN: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[[^\]]+\]\{[^}]+\}").unwrap());
531
532/// Detect Pandoc bracketed span ranges (`[text]{attrs}`).
533///
534/// Returns byte ranges covering the full `[...]` + `{...}` span. The detector
535/// is structural only — it does not validate `attrs` content.
536pub fn detect_bracketed_span_ranges(content: &str) -> Vec<ByteRange> {
537    let mut ranges = Vec::new();
538    for m in BRACKETED_SPAN.find_iter(content) {
539        ranges.push(ByteRange {
540            start: m.start(),
541            end: m.end(),
542        });
543    }
544    ranges
545}
546
547// ============================================================================
548// Line Block Support
549// ============================================================================
550//
551// Pandoc `line_blocks` extension: a contiguous run of lines starting with `| `
552// (pipe space). Each line in a line block is rendered as a separate line of
553// verse or address. Continuation lines — indented, non-empty, not starting
554// with `|` — extend the immediately preceding block line.
555//
556// Distinguished from pipe tables: a line whose trimmed form ends with `|`
557// (i.e. `| col1 | col2 |`) is a table row, not a line block entry.
558
559/// Detect Pandoc line blocks (consecutive lines starting with `| `).
560///
561/// A line block is a contiguous run of lines where each line either:
562/// - Starts with `| ` (a single pipe followed by space) and does NOT
563///   end with `|` (which would be a pipe-table row), or
564/// - Is a continuation line (whitespace-indented, non-empty, not starting
565///   with `|`) appearing within an active line-block run.
566///
567/// A blank line ends the run.
568pub fn detect_line_block_ranges(content: &str) -> Vec<ByteRange> {
569    let mut ranges = Vec::new();
570    let mut in_block = false;
571    let mut block_start = 0usize;
572    let mut block_end = 0usize;
573    let mut byte_offset = 0usize;
574
575    for line in content.split_inclusive('\n') {
576        let trimmed = line.trim_end_matches('\n').trim_end_matches('\r');
577        let is_line_block_line = trimmed.starts_with("| ") && !trimmed.trim_end().ends_with('|');
578        let is_continuation = in_block
579            && !trimmed.is_empty()
580            && trimmed.starts_with(|c: char| c.is_whitespace())
581            && !trimmed.trim_start().starts_with('|');
582
583        if is_line_block_line || is_continuation {
584            if !in_block {
585                block_start = byte_offset;
586                in_block = true;
587            }
588            block_end = byte_offset + line.len();
589        } else if in_block {
590            ranges.push(ByteRange {
591                start: block_start,
592                end: block_end,
593            });
594            in_block = false;
595        }
596        byte_offset += line.len();
597    }
598    if in_block {
599        ranges.push(ByteRange {
600            start: block_start,
601            end: block_end,
602        });
603    }
604    ranges
605}
606
607// ============================================================================
608// Pipe-Table Caption Support
609// ============================================================================
610//
611// Pandoc `table_captions` extension: a `: caption text` line that appears
612// adjacent to a pipe table, separated by exactly one blank line (either
613// above or below). Without the blank-line adjacency to a pipe table, a
614// `: text` line is a definition-list value and must NOT be matched here.
615//
616// Matching rule:
617//   caption_below: caption at line i, blank at i+1, pipe-table row at i+2
618//   caption_above: pipe-table row at i-2, blank at i-1, caption at i
619
620/// Detect Pandoc pipe-table caption lines (`: caption`) adjacent (above or
621/// below, separated by exactly one blank line) to a pipe table. A `: text`
622/// line not adjacent to a table is treated as a definition-list value and
623/// is not matched here.
624///
625/// Iterates with `split_inclusive('\n')` so byte ranges remain accurate for
626/// content without a trailing newline and for CRLF line endings.
627pub fn detect_pipe_table_caption_ranges(content: &str) -> Vec<ByteRange> {
628    let mut lines: Vec<&str> = Vec::new();
629    let mut line_offsets: Vec<usize> = Vec::new();
630    let mut offset = 0usize;
631    for line in content.split_inclusive('\n') {
632        line_offsets.push(offset);
633        lines.push(line);
634        offset += line.len();
635    }
636    line_offsets.push(offset);
637
638    fn line_body(line: &str) -> &str {
639        line.trim_end_matches('\n').trim_end_matches('\r')
640    }
641    fn is_pipe_table_row(line: &str) -> bool {
642        let t = line_body(line).trim();
643        t.starts_with('|') && t.ends_with('|') && t.len() >= 3
644    }
645    fn is_caption_line(line: &str) -> bool {
646        line_body(line).trim_start().starts_with(": ")
647    }
648    fn is_blank(line: &str) -> bool {
649        line_body(line).trim().is_empty()
650    }
651
652    let mut ranges = Vec::new();
653    for (i, line) in lines.iter().enumerate() {
654        if !is_caption_line(line) {
655            continue;
656        }
657        let table_below = i + 2 < lines.len() && is_blank(lines[i + 1]) && is_pipe_table_row(lines[i + 2]);
658        let table_above = i >= 2 && is_blank(lines[i - 1]) && is_pipe_table_row(lines[i - 2]);
659        if table_below || table_above {
660            ranges.push(ByteRange {
661                start: line_offsets[i],
662                end: line_offsets[i + 1],
663            });
664        }
665    }
666    ranges
667}
668
669// ============================================================================
670// YAML Metadata Block Support
671// ============================================================================
672//
673// Pandoc `yaml_metadata_block` extension: one or more `---`-delimited YAML
674// blocks anywhere in the document. Unlike standard frontmatter (single block
675// at file start), Pandoc allows:
676//   - Multiple blocks per document
677//   - `---` opener
678//   - Either `---` or `...` as the closer
679//   - Opener must be at start-of-file OR immediately after a blank line
680//   - Unterminated openers are skipped
681
682/// Detect Pandoc YAML metadata blocks (`---...---` or `---...`).
683/// Unlike standard frontmatter, these can appear anywhere in the document
684/// and there can be multiple per file.
685pub fn detect_yaml_metadata_block_ranges(content: &str) -> Vec<ByteRange> {
686    let mut lines: Vec<&str> = Vec::new();
687    let mut line_offsets: Vec<usize> = Vec::new();
688    let mut offset = 0usize;
689    for line in content.split_inclusive('\n') {
690        line_offsets.push(offset);
691        lines.push(line);
692        offset += line.len();
693    }
694    line_offsets.push(offset);
695
696    fn line_body(line: &str) -> &str {
697        line.trim_end_matches('\n').trim_end_matches('\r')
698    }
699    fn is_blank(line: &str) -> bool {
700        line_body(line).trim().is_empty()
701    }
702    fn is_opener(line: &str) -> bool {
703        line_body(line).trim_end() == "---"
704    }
705    fn is_closer(line: &str) -> bool {
706        let t = line_body(line).trim_end();
707        t == "---" || t == "..."
708    }
709
710    let mut ranges = Vec::new();
711    let mut i = 0;
712    while i < lines.len() {
713        let preceded_by_blank = i == 0 || is_blank(lines[i - 1]);
714        if preceded_by_blank && is_opener(lines[i]) {
715            let mut j = i + 1;
716            let mut found_closer = false;
717            while j < lines.len() {
718                if is_closer(lines[j]) {
719                    ranges.push(ByteRange {
720                        start: line_offsets[i],
721                        end: line_offsets[j + 1],
722                    });
723                    i = j + 1;
724                    found_closer = true;
725                    break;
726                }
727                j += 1;
728            }
729            if !found_closer {
730                // Unterminated opener — skip and continue scanning.
731                i += 1;
732            }
733        } else {
734            i += 1;
735        }
736    }
737    ranges
738}
739
740// ============================================================================
741// Grid Table Support
742// ============================================================================
743//
744// Pandoc `grid_tables` extension: a contiguous block of lines where the
745// first line is a `+---+---+` border row (`+` corners, `-` or `=` between),
746// followed by alternating content rows (`| ... | ... |`) and border rows
747// (`+---+---+` or `+===+===+`), ending with a closing border row.
748// At least one content row is required for a valid grid table.
749
750/// Pattern for a grid-table border row: `+---+---+` or `+===+===+`.
751static GRID_BORDER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\+(?:[-=]+\+)+\s*$").unwrap());
752
753/// Pattern for a grid-table content row: `| ... | ... |`.
754static GRID_CONTENT: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^\|.*\|\s*$").unwrap());
755
756/// Detect Pandoc grid tables. A grid table is a contiguous run of lines
757/// where the first line is a `+---+---+` border, followed by alternating
758/// content rows `|...|` and border rows, and ending in a border row.
759/// At least one content row is required.
760///
761/// Iterates with `split_inclusive('\n')` so byte ranges remain accurate for
762/// content without a trailing newline and for CRLF line endings.
763pub fn detect_grid_table_ranges(content: &str) -> Vec<ByteRange> {
764    let mut lines: Vec<&str> = Vec::new();
765    let mut line_offsets: Vec<usize> = Vec::new();
766    let mut offset = 0usize;
767    for line in content.split_inclusive('\n') {
768        line_offsets.push(offset);
769        lines.push(line);
770        offset += line.len();
771    }
772    line_offsets.push(offset);
773
774    fn line_body(line: &str) -> &str {
775        line.trim_end_matches('\n').trim_end_matches('\r')
776    }
777    fn is_border(line: &str) -> bool {
778        GRID_BORDER.is_match(line_body(line))
779    }
780    fn is_content(line: &str) -> bool {
781        GRID_CONTENT.is_match(line_body(line))
782    }
783
784    let mut ranges = Vec::new();
785    let mut i = 0;
786    while i < lines.len() {
787        if is_border(lines[i]) {
788            let start_line = i;
789            let mut j = i + 1;
790            let mut last_border = i;
791            let mut saw_content = false;
792            while j < lines.len() {
793                if is_border(lines[j]) {
794                    last_border = j;
795                    j += 1;
796                } else if is_content(lines[j]) {
797                    saw_content = true;
798                    j += 1;
799                } else {
800                    break;
801                }
802            }
803            // A valid grid table needs at least one content row and a
804            // closing border (last_border > start_line).
805            if saw_content && last_border > start_line {
806                ranges.push(ByteRange {
807                    start: line_offsets[start_line],
808                    end: line_offsets[last_border + 1],
809                });
810                i = last_border + 1;
811                continue;
812            }
813        }
814        i += 1;
815    }
816    ranges
817}
818
819// ============================================================================
820// Multi-line Table Support
821// ============================================================================
822//
823// Pandoc `multiline_tables` extension: a block whose column widths are declared
824// by an underline row of dashes-separated-by-spaces (MULTI_LINE_UNDERLINE), with
825// an optional top-border and a mandatory closing solid-dash row (MULTI_LINE_BORDER).
826// The header line immediately precedes the underline row.
827
828/// Pattern for a multi-line table column-width underline row.
829/// Matches two or more runs of dashes separated by spaces, e.g.:
830/// `----------- ------- --------------- -------------------------`
831static MULTI_LINE_UNDERLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-{2,}(?:\s+-{2,})+\s*$").unwrap());
832
833/// Pattern for a multi-line table solid border row (≥10 dashes).
834/// Used as both an optional top border and the mandatory closing border.
835static MULTI_LINE_BORDER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^-{10,}\s*$").unwrap());
836
837/// Detect Pandoc multi-line table ranges.
838///
839/// A multi-line table is recognised by an underline row (dashes separated by
840/// spaces, ≥2 columns) immediately following a non-empty header line. The table
841/// extends to the next solid-dash border row (≥10 dashes). An optional solid
842/// border may appear before the header as well.
843///
844/// Iterates with `split_inclusive('\n')` so byte ranges remain accurate for
845/// content without a trailing newline and for CRLF line endings.
846pub fn detect_multi_line_table_ranges(content: &str) -> Vec<ByteRange> {
847    let mut lines: Vec<&str> = Vec::new();
848    let mut line_offsets: Vec<usize> = Vec::new();
849    let mut offset = 0usize;
850    for line in content.split_inclusive('\n') {
851        line_offsets.push(offset);
852        lines.push(line);
853        offset += line.len();
854    }
855    line_offsets.push(offset);
856
857    fn line_body(line: &str) -> &str {
858        line.trim_end_matches('\n').trim_end_matches('\r')
859    }
860    fn is_underline(line: &str) -> bool {
861        MULTI_LINE_UNDERLINE.is_match(line_body(line))
862    }
863    fn is_border(line: &str) -> bool {
864        MULTI_LINE_BORDER.is_match(line_body(line))
865    }
866
867    let mut ranges = Vec::new();
868    let mut i = 0;
869    while i < lines.len() {
870        // Look for an underline row whose previous line is a non-empty header.
871        if i >= 1 && is_underline(lines[i]) && !line_body(lines[i - 1]).is_empty() {
872            // Walk backward from i-1 to find the first line of the header block.
873            // The header may span multiple lines; keep going back while lines are
874            // non-empty and not themselves borders or underlines.
875            let mut header_start = i - 1;
876            while header_start > 0
877                && !line_body(lines[header_start - 1]).is_empty()
878                && !is_border(lines[header_start - 1])
879                && !is_underline(lines[header_start - 1])
880            {
881                header_start -= 1;
882            }
883
884            // Optionally include a solid border that precedes the header block.
885            let start_line = if header_start > 0 && is_border(lines[header_start - 1]) {
886                header_start - 1
887            } else {
888                header_start
889            };
890
891            // Walk forward from the line after the underline to find the closing border.
892            let mut j = i + 1;
893            let mut end_line: Option<usize> = None;
894            while j < lines.len() {
895                if is_border(lines[j]) {
896                    // Closing solid-dash border found.
897                    end_line = Some(j);
898                    break;
899                } else if j > i + 1 && is_underline(lines[j]) {
900                    // Another column-width underline (second header section?):
901                    // the previous line is the last body line.
902                    end_line = Some(j - 1);
903                    break;
904                }
905                j += 1;
906            }
907
908            if let Some(end) = end_line {
909                ranges.push(ByteRange {
910                    start: line_offsets[start_line],
911                    end: line_offsets[end + 1],
912                });
913                i = end + 1;
914                continue;
915            }
916            // No closing border found — skip this candidate and keep walking.
917        }
918        i += 1;
919    }
920    ranges
921}
922
923/// Detect Pandoc inline footnote ranges (`^[note text]`).
924///
925/// Returns byte ranges covering the entire `^[...]` span. Intended for rules that
926/// process bracket-like syntax to skip Pandoc inline footnotes.
927pub fn detect_inline_footnote_ranges(content: &str) -> Vec<ByteRange> {
928    let mut ranges = Vec::new();
929    for caps in INLINE_FOOTNOTE_PATTERN.captures_iter(content) {
930        let m = caps.get(1).unwrap();
931        ranges.push(ByteRange {
932            start: m.start(),
933            end: m.end(),
934        });
935    }
936    ranges
937}
938
939/// Find all citation ranges in content (byte ranges)
940/// Returns ranges for both bracketed `[@key]` and inline `@key` citations.
941///
942/// Markdown link labels are excluded: when `[text]` is immediately followed
943/// by `(` (inline link) or `[` (reference link), Pandoc prefers the link
944/// parse over the citation parse, so any `@key` mentioned inside `text`
945/// is not a citation. The link-label scan covers both bracketed-form
946/// matches and free-floating inline `@key` matches.
947pub fn find_citation_ranges(content: &str) -> Vec<ByteRange> {
948    let mut ranges = Vec::new();
949
950    // Pre-compute Markdown link-label byte ranges (the `[text]` portion of
951    // `[text](url)` or `[text][ref]`).
952    let link_label_ranges: Vec<(usize, usize)> = LINK_LABEL_PATTERN
953        .captures_iter(content)
954        .filter_map(|c| c.get(1).map(|m| (m.start(), m.end())))
955        .collect();
956
957    let in_link_label = |pos: usize| -> bool { link_label_ranges.iter().any(|&(s, e)| pos >= s && pos < e) };
958
959    // Find bracketed citations first (higher priority)
960    for mat in BRACKETED_CITATION_PATTERN.find_iter(content) {
961        if in_link_label(mat.start()) {
962            continue;
963        }
964        ranges.push(ByteRange {
965            start: mat.start(),
966            end: mat.end(),
967        });
968    }
969
970    // Find inline citations (but not inside already-found brackets or link labels)
971    for cap in INLINE_CITATION_PATTERN.captures_iter(content) {
972        if let Some(mat) = cap.get(1) {
973            let start = mat.start();
974            if in_link_label(start) {
975                continue;
976            }
977            // Skip if this is inside a bracketed citation
978            if !ranges.iter().any(|r| start >= r.start && start < r.end) {
979                ranges.push(ByteRange { start, end: mat.end() });
980            }
981        }
982    }
983
984    // Sort by start position
985    ranges.sort_by_key(|r| r.start);
986    ranges
987}
988
989#[cfg(test)]
990mod tests {
991    use super::*;
992
993    #[test]
994    fn test_div_open_detection() {
995        // Valid div openings
996        assert!(is_div_open("::: {.callout-note}"));
997        assert!(is_div_open("::: {.callout-warning}"));
998        assert!(is_div_open("::: {#myid .class}"));
999        assert!(is_div_open("::: bordered"));
1000        assert!(is_div_open("  ::: {.note}")); // Indented
1001        assert!(is_div_open("::: {.callout-tip title=\"My Title\"}"));
1002
1003        // Invalid patterns
1004        assert!(!is_div_open(":::")); // Just closing marker
1005        assert!(!is_div_open(":::  ")); // Just closing with trailing space
1006        assert!(!is_div_open("Regular text"));
1007        assert!(!is_div_open("# Heading"));
1008        assert!(!is_div_open("```python")); // Code fence
1009    }
1010
1011    #[test]
1012    fn test_div_close_detection() {
1013        assert!(is_div_close(":::"));
1014        assert!(is_div_close(":::  "));
1015        assert!(is_div_close("  :::"));
1016        assert!(is_div_close("    :::  "));
1017
1018        assert!(!is_div_close("::: {.note}"));
1019        assert!(!is_div_close("::: class"));
1020        assert!(!is_div_close(":::note"));
1021    }
1022
1023    #[test]
1024    fn test_callout_detection() {
1025        assert!(is_callout_open("::: {.callout-note}"));
1026        assert!(is_callout_open("::: {.callout-warning}"));
1027        assert!(is_callout_open("::: {.callout-tip}"));
1028        assert!(is_callout_open("::: {.callout-important}"));
1029        assert!(is_callout_open("::: {.callout-caution}"));
1030        assert!(is_callout_open("::: {#myid .callout-note}"));
1031        assert!(is_callout_open("::: {.callout-note title=\"Title\"}"));
1032
1033        assert!(!is_callout_open("::: {.note}")); // Not a callout
1034        assert!(!is_callout_open("::: {.bordered}")); // Not a callout
1035        assert!(!is_callout_open("::: callout-note")); // Missing braces
1036    }
1037
1038    #[test]
1039    fn test_div_tracker() {
1040        let mut tracker = DivTracker::new();
1041
1042        // Enter a div
1043        assert!(tracker.process_line("::: {.callout-note}"));
1044        assert!(tracker.is_inside_div());
1045
1046        // Inside content
1047        assert!(tracker.process_line("This is content."));
1048        assert!(tracker.is_inside_div());
1049
1050        // Exit the div
1051        assert!(!tracker.process_line(":::"));
1052        assert!(!tracker.is_inside_div());
1053    }
1054
1055    #[test]
1056    fn test_nested_divs() {
1057        let mut tracker = DivTracker::new();
1058
1059        // Outer div
1060        assert!(tracker.process_line("::: {.outer}"));
1061        assert!(tracker.is_inside_div());
1062
1063        // Inner div
1064        assert!(tracker.process_line("  ::: {.inner}"));
1065        assert!(tracker.is_inside_div());
1066
1067        // Content
1068        assert!(tracker.process_line("    Content"));
1069        assert!(tracker.is_inside_div());
1070
1071        // Close inner
1072        assert!(tracker.process_line("  :::"));
1073        assert!(tracker.is_inside_div());
1074
1075        // Close outer
1076        assert!(!tracker.process_line(":::"));
1077        assert!(!tracker.is_inside_div());
1078    }
1079
1080    #[test]
1081    fn test_detect_div_block_ranges() {
1082        let content = r#"# Heading
1083
1084::: {.callout-note}
1085This is a note.
1086:::
1087
1088Regular text.
1089
1090::: {.bordered}
1091Content here.
1092:::
1093"#;
1094        let ranges = detect_div_block_ranges(content);
1095        assert_eq!(ranges.len(), 2);
1096
1097        // First div
1098        let first_div_content = &content[ranges[0].start..ranges[0].end];
1099        assert!(first_div_content.contains("callout-note"));
1100        assert!(first_div_content.contains("This is a note"));
1101
1102        // Second div
1103        let second_div_content = &content[ranges[1].start..ranges[1].end];
1104        assert!(second_div_content.contains("bordered"));
1105        assert!(second_div_content.contains("Content here"));
1106    }
1107
1108    #[test]
1109    fn test_pandoc_attributes() {
1110        assert!(has_pandoc_attributes("# Heading {#custom-id}"));
1111        assert!(has_pandoc_attributes("# Heading {.unnumbered}"));
1112        assert!(has_pandoc_attributes("![Image](path.png){#fig-1 width=\"50%\"}"));
1113        assert!(has_pandoc_attributes("{#id .class key=\"value\"}"));
1114
1115        assert!(!has_pandoc_attributes("# Heading"));
1116        assert!(!has_pandoc_attributes("Regular text"));
1117        assert!(!has_pandoc_attributes("{}"));
1118    }
1119
1120    #[test]
1121    fn test_div_with_title_attribute() {
1122        let content = r#"::: {.callout-note title="Important Note"}
1123This is the content of the note.
1124It can span multiple lines.
1125:::
1126"#;
1127        let ranges = detect_div_block_ranges(content);
1128        assert_eq!(ranges.len(), 1);
1129        assert!(is_callout_open("::: {.callout-note title=\"Important Note\"}"));
1130    }
1131
1132    #[test]
1133    fn test_unclosed_div() {
1134        let content = r#"::: {.callout-note}
1135This note is never closed.
1136"#;
1137        let ranges = detect_div_block_ranges(content);
1138        assert_eq!(ranges.len(), 1);
1139        // Should include all content to end of document
1140        assert_eq!(ranges[0].end, content.len());
1141    }
1142
1143    #[test]
1144    fn test_heading_inside_callout() {
1145        let content = r#"::: {.callout-warning}
1146## Warning Title
1147
1148Warning content here.
1149:::
1150"#;
1151        let ranges = detect_div_block_ranges(content);
1152        assert_eq!(ranges.len(), 1);
1153
1154        let div_content = &content[ranges[0].start..ranges[0].end];
1155        assert!(div_content.contains("## Warning Title"));
1156    }
1157
1158    // Citation tests
1159    #[test]
1160    fn test_has_citations() {
1161        assert!(has_citations("See @smith2020 for details."));
1162        assert!(has_citations("[@smith2020]"));
1163        assert!(has_citations("Multiple [@a; @b] citations"));
1164        assert!(!has_citations("No citations here"));
1165        // has_citations is just a quick @ check - emails will pass (intended behavior)
1166        assert!(has_citations("Email: user@example.com"));
1167    }
1168
1169    #[test]
1170    fn test_bracketed_citation_detection() {
1171        let content = "See [@smith2020] for more info.";
1172        let ranges = find_citation_ranges(content);
1173        assert_eq!(ranges.len(), 1);
1174        assert_eq!(&content[ranges[0].start..ranges[0].end], "[@smith2020]");
1175    }
1176
1177    #[test]
1178    fn test_inline_citation_detection() {
1179        let content = "As @smith2020 argues, this is true.";
1180        let ranges = find_citation_ranges(content);
1181        assert_eq!(ranges.len(), 1);
1182        assert_eq!(&content[ranges[0].start..ranges[0].end], "@smith2020");
1183    }
1184
1185    #[test]
1186    fn test_multiple_citations_in_brackets() {
1187        let content = "See [@smith2020; @jones2021] for details.";
1188        let ranges = find_citation_ranges(content);
1189        assert_eq!(ranges.len(), 1);
1190        assert_eq!(&content[ranges[0].start..ranges[0].end], "[@smith2020; @jones2021]");
1191    }
1192
1193    #[test]
1194    fn test_citation_with_prefix() {
1195        let content = "[see @smith2020, p. 10]";
1196        let ranges = find_citation_ranges(content);
1197        assert_eq!(ranges.len(), 1);
1198        assert_eq!(&content[ranges[0].start..ranges[0].end], "[see @smith2020, p. 10]");
1199    }
1200
1201    #[test]
1202    fn test_suppress_author_citation() {
1203        let content = "The theory [-@smith2020] states that...";
1204        let ranges = find_citation_ranges(content);
1205        assert_eq!(ranges.len(), 1);
1206        assert_eq!(&content[ranges[0].start..ranges[0].end], "[-@smith2020]");
1207    }
1208
1209    #[test]
1210    fn test_mixed_citations() {
1211        let content = "@smith2020 argues that [@jones2021] is wrong.";
1212        let ranges = find_citation_ranges(content);
1213        assert_eq!(ranges.len(), 2);
1214        // Inline citation
1215        assert_eq!(&content[ranges[0].start..ranges[0].end], "@smith2020");
1216        // Bracketed citation
1217        assert_eq!(&content[ranges[1].start..ranges[1].end], "[@jones2021]");
1218    }
1219
1220    #[test]
1221    fn test_email_not_confused_with_citation() {
1222        // Email addresses should not match as inline citations when properly filtered
1223        // The has_citations() is just a quick check, but find_citation_ranges uses more strict patterns
1224        let content = "Contact user@example.com for help.";
1225        let ranges = find_citation_ranges(content);
1226        // Email should not be detected as citation (@ is preceded by alphanumeric)
1227        assert!(
1228            ranges.is_empty()
1229                || !ranges.iter().any(|r| {
1230                    let s = &content[r.start..r.end];
1231                    s.contains("example.com")
1232                })
1233        );
1234    }
1235
1236    /// Bracketed link text containing an email (`@` embedded in a word) must
1237    /// NOT be classified as a Pandoc citation. A citation `@key` requires the
1238    /// `@` to sit at a citation boundary — start of bracket, after `-`, after
1239    /// whitespace, or after `;` — never in the middle of a word like an email.
1240    #[test]
1241    fn test_bracketed_link_text_with_email_not_citation() {
1242        let content = "[contact user@example.com](#missing)";
1243        let ranges = find_citation_ranges(content);
1244        assert!(
1245            ranges.is_empty(),
1246            "Bracketed link text with embedded email must not be detected as a Pandoc citation: {ranges:?}"
1247        );
1248    }
1249
1250    /// Same bracketed text with an empty link target — also a link, not a citation.
1251    #[test]
1252    fn test_bracketed_link_text_with_email_empty_href_not_citation() {
1253        let content = "[contact user@example.com]()";
1254        let ranges = find_citation_ranges(content);
1255        assert!(
1256            ranges.is_empty(),
1257            "Bracketed link text with embedded email and empty href must not be a Pandoc citation: {ranges:?}"
1258        );
1259    }
1260
1261    /// A bracketed label whose text mentions a citation key but is *immediately*
1262    /// followed by a link target `(...)` is a Markdown link, not a citation.
1263    /// Pandoc itself prefers the link interpretation for `[text](url)` over a
1264    /// citation parse, even when `text` contains `@key`.
1265    #[test]
1266    fn test_bracketed_text_followed_by_inline_link_not_citation() {
1267        let content = "[see @smith2020](#missing)";
1268        let ranges = find_citation_ranges(content);
1269        assert!(
1270            ranges.is_empty(),
1271            "Bracketed text followed by `(...)` is a link, not a citation: {ranges:?}"
1272        );
1273    }
1274
1275    /// Same disambiguation when the link target is empty: still a link.
1276    #[test]
1277    fn test_bracketed_text_followed_by_empty_inline_link_not_citation() {
1278        let content = "[see @smith2020]()";
1279        let ranges = find_citation_ranges(content);
1280        assert!(
1281            ranges.is_empty(),
1282            "Bracketed text followed by `()` is a link with empty href, not a citation: {ranges:?}"
1283        );
1284    }
1285
1286    /// Reference-style links `[text][ref]` are also links — the bracketed
1287    /// label `[text]` must not be classified as a citation just because it
1288    /// contains `@key`.
1289    #[test]
1290    fn test_bracketed_text_followed_by_reference_link_not_citation() {
1291        let content = "[see @smith2020][ref]";
1292        let ranges = find_citation_ranges(content);
1293        assert!(
1294            ranges.is_empty(),
1295            "Bracketed text followed by `[ref]` is a reference link, not a citation: {ranges:?}"
1296        );
1297    }
1298
1299    /// Standalone bracketed citations remain citations: nothing immediately
1300    /// follows the closing `]`, so the link disambiguation does not apply.
1301    #[test]
1302    fn test_standalone_bracketed_citation_still_detected() {
1303        let content = "See [see @smith2020] for details.";
1304        let ranges = find_citation_ranges(content);
1305        assert!(
1306            ranges.iter().any(|r| &content[r.start..r.end] == "[see @smith2020]"),
1307            "Standalone bracketed citation must still be detected: {ranges:?}"
1308        );
1309    }
1310
1311    /// Citation followed by sentence punctuation remains a citation.
1312    #[test]
1313    fn test_bracketed_citation_followed_by_punctuation_still_detected() {
1314        let content = "Note [@smith2020].";
1315        let ranges = find_citation_ranges(content);
1316        assert!(
1317            ranges.iter().any(|r| &content[r.start..r.end] == "[@smith2020]"),
1318            "Bracketed citation followed by `.` must still be detected: {ranges:?}"
1319        );
1320    }
1321
1322    #[test]
1323    fn test_detect_inline_footnotes() {
1324        let content = "See ^[a quick note] here.\nAnd ^[another one] too.\n";
1325        let ranges = detect_inline_footnote_ranges(content);
1326        assert_eq!(ranges.len(), 2);
1327        // First footnote
1328        let first_start = content.find("^[").unwrap();
1329        let first_end = content[first_start..].find(']').unwrap() + first_start + 1;
1330        assert_eq!(ranges[0].start, first_start);
1331        assert_eq!(ranges[0].end, first_end);
1332        // Second footnote
1333        let second_start = content[first_end..].find("^[").unwrap() + first_end;
1334        let second_end = content[second_start..].find(']').unwrap() + second_start + 1;
1335        assert_eq!(ranges[1].start, second_start);
1336        assert_eq!(ranges[1].end, second_end);
1337    }
1338
1339    #[test]
1340    fn test_inline_footnote_with_brackets_inside() {
1341        // Inline footnotes do not nest; a `]` inside terminates the footnote.
1342        // This documents the chosen behavior. Pandoc itself supports nesting via
1343        // backslash-escapes; rumdl currently treats the first unescaped `]` as
1344        // the terminator.
1345        let content = "Note ^[ref to [other] thing] here.\n";
1346        let ranges = detect_inline_footnote_ranges(content);
1347        assert_eq!(ranges.len(), 1);
1348    }
1349
1350    #[test]
1351    fn test_inline_footnote_does_not_match_image_or_link() {
1352        // `![alt]` is an image, not a footnote.
1353        let content = "An image ![alt](url) and a link [txt](url).\n";
1354        let ranges = detect_inline_footnote_ranges(content);
1355        assert_eq!(ranges.len(), 0);
1356    }
1357
1358    #[test]
1359    fn test_implicit_header_reference_slug() {
1360        // Pandoc lowercases, replaces internal whitespace with `-`, and strips
1361        // punctuation other than `_`, `-`, `.`.
1362        assert_eq!(pandoc_header_slug("My Section"), "my-section");
1363        assert_eq!(pandoc_header_slug("API: v2!"), "api-v2");
1364        assert_eq!(pandoc_header_slug("  Trim Me  "), "trim-me");
1365        assert_eq!(pandoc_header_slug("Multiple   Spaces"), "multiple-spaces");
1366    }
1367
1368    #[test]
1369    fn test_collect_pandoc_header_slugs() {
1370        let content = "# My Section\n\n## Sub-section\n\nbody\n";
1371        let slugs = collect_pandoc_header_slugs(content);
1372        assert!(slugs.contains("my-section"));
1373        assert!(slugs.contains("sub-section"));
1374    }
1375
1376    #[test]
1377    fn test_collect_pandoc_header_slugs_strips_attribute_block() {
1378        let content = "# My Section {#custom-id .red}\n## Plain Section\n";
1379        let slugs = collect_pandoc_header_slugs(content);
1380        assert!(slugs.contains("my-section"));
1381        assert!(slugs.contains("plain-section"));
1382        // Slug must not include the attribute block contents.
1383        assert!(!slugs.iter().any(|s| s.contains("custom-id")));
1384    }
1385
1386    #[test]
1387    fn test_collect_pandoc_header_slugs_preserves_body_braces() {
1388        // `{` in heading body must NOT be mistaken for an attribute block.
1389        let content = "# Some {curly} word in title\n";
1390        let slugs = collect_pandoc_header_slugs(content);
1391        assert!(slugs.contains("some-curly-word-in-title"));
1392    }
1393
1394    #[test]
1395    fn test_collect_pandoc_header_slugs_disambiguates_duplicates() {
1396        // Pandoc's auto_identifiers extension assigns the second heading with the
1397        // same slug `<base>-1`, the third `<base>-2`, etc. Both base and suffixed
1398        // forms must be reachable as link targets.
1399        let content = "# A.\n\nbody\n\n# A.\n";
1400        let slugs = collect_pandoc_header_slugs(content);
1401        assert!(slugs.contains("a."), "first occurrence should expose base slug `a.`");
1402        assert!(
1403            slugs.contains("a.-1"),
1404            "second occurrence should expose `a.-1`: got {slugs:?}"
1405        );
1406    }
1407
1408    #[test]
1409    fn test_collect_pandoc_header_slugs_three_duplicates_get_two_suffixes() {
1410        let content = "# Intro\n\n# Intro\n\n# Intro\n";
1411        let slugs = collect_pandoc_header_slugs(content);
1412        assert!(slugs.contains("intro"));
1413        assert!(slugs.contains("intro-1"));
1414        assert!(slugs.contains("intro-2"));
1415        assert!(
1416            !slugs.contains("intro-3"),
1417            "three occurrences must produce only -1 and -2 suffixes, not -3: got {slugs:?}"
1418        );
1419    }
1420
1421    #[test]
1422    fn test_collect_pandoc_header_slugs_unique_headings_get_no_suffix() {
1423        let content = "# Foo\n\n# Bar\n\n# Baz\n";
1424        let slugs = collect_pandoc_header_slugs(content);
1425        assert!(slugs.contains("foo"));
1426        assert!(slugs.contains("bar"));
1427        assert!(slugs.contains("baz"));
1428        // Unique headings must not gain a `-1` suffix.
1429        assert!(!slugs.contains("foo-1"));
1430        assert!(!slugs.contains("bar-1"));
1431        assert!(!slugs.contains("baz-1"));
1432    }
1433
1434    #[test]
1435    fn test_detect_example_list_markers() {
1436        let content = "(@)  First item.\n(@good) Second item.\n(@) Third item.\n";
1437        let ranges = detect_example_list_marker_ranges(content);
1438        assert_eq!(ranges.len(), 3);
1439        assert_eq!(ranges[0].start, 0);
1440        assert_eq!(&content[ranges[0].start..ranges[0].end], "(@)");
1441        let second_start = content.find("(@good)").unwrap();
1442        assert_eq!(ranges[1].start, second_start);
1443        assert_eq!(&content[ranges[1].start..ranges[1].end], "(@good)");
1444    }
1445
1446    #[test]
1447    fn test_detect_example_references() {
1448        // `(@label)` mid-paragraph is a reference, not a list marker.
1449        let content = "As shown in (@good), this works.\n";
1450        let marker_ranges = detect_example_list_marker_ranges(content);
1451        let ranges = detect_example_reference_ranges(content, &marker_ranges);
1452        assert_eq!(ranges.len(), 1);
1453    }
1454
1455    #[test]
1456    fn test_example_marker_must_be_at_line_start() {
1457        let content = "Inline (@) is not a marker.\n";
1458        let ranges = detect_example_list_marker_ranges(content);
1459        assert_eq!(ranges.len(), 0);
1460    }
1461
1462    #[test]
1463    fn test_detect_subscript() {
1464        let content = "H~2~O is water.\n";
1465        let ranges = detect_subscript_superscript_ranges(content);
1466        assert_eq!(ranges.len(), 1);
1467        assert_eq!(&content[ranges[0].start..ranges[0].end], "~2~");
1468    }
1469
1470    #[test]
1471    fn test_detect_superscript() {
1472        let content = "2^10^ is 1024.\n";
1473        let ranges = detect_subscript_superscript_ranges(content);
1474        assert_eq!(ranges.len(), 1);
1475        assert_eq!(&content[ranges[0].start..ranges[0].end], "^10^");
1476    }
1477
1478    #[test]
1479    fn test_subscript_does_not_match_strikethrough() {
1480        // `~~text~~` is GFM strikethrough, not subscript.
1481        let content = "This is ~~struck~~.\n";
1482        let ranges = detect_subscript_superscript_ranges(content);
1483        assert_eq!(ranges.len(), 0);
1484    }
1485
1486    #[test]
1487    fn test_superscript_with_internal_space_is_not_matched() {
1488        // Pandoc requires no whitespace inside `^...^`.
1489        let content = "x^a b^ y\n";
1490        let ranges = detect_subscript_superscript_ranges(content);
1491        assert_eq!(ranges.len(), 0);
1492    }
1493
1494    #[test]
1495    fn test_subscript_at_start_of_input() {
1496        // Position 0: previous-byte path uses checked_sub(1).unwrap_or(0).
1497        let content = "~x~ rest of line\n";
1498        let ranges = detect_subscript_superscript_ranges(content);
1499        assert_eq!(ranges.len(), 1);
1500        assert_eq!(&content[ranges[0].start..ranges[0].end], "~x~");
1501    }
1502
1503    #[test]
1504    fn test_superscript_at_end_of_input_no_newline() {
1505        // EOF: next-byte path uses bytes.get(end).unwrap_or(0).
1506        let content = "text ^x^";
1507        let ranges = detect_subscript_superscript_ranges(content);
1508        assert_eq!(ranges.len(), 1);
1509        assert_eq!(&content[ranges[0].start..ranges[0].end], "^x^");
1510    }
1511
1512    #[test]
1513    fn test_detect_inline_code_attribute() {
1514        // `code`{.python} — the {.python} is a Pandoc attribute on inline code.
1515        let content = "Use `print()`{.python} for output.\n";
1516        let ranges = detect_inline_code_attr_ranges(content);
1517        assert_eq!(ranges.len(), 1);
1518        let r = &ranges[0];
1519        assert_eq!(&content[r.start..r.end], "{.python}");
1520    }
1521
1522    #[test]
1523    fn test_inline_code_attribute_only_after_backtick() {
1524        // A bare `{...}` in prose is not an inline code attribute.
1525        let content = "Use {.example} for the class.\n";
1526        let ranges = detect_inline_code_attr_ranges(content);
1527        assert_eq!(ranges.len(), 0);
1528    }
1529
1530    #[test]
1531    fn test_inline_code_attribute_multiple_on_one_line() {
1532        let content = "Use `a`{.x} and `b`{.y} here.\n";
1533        let ranges = detect_inline_code_attr_ranges(content);
1534        assert_eq!(ranges.len(), 2);
1535        assert_eq!(&content[ranges[0].start..ranges[0].end], "{.x}");
1536        assert_eq!(&content[ranges[1].start..ranges[1].end], "{.y}");
1537    }
1538
1539    #[test]
1540    fn test_inline_code_attribute_compound_attributes() {
1541        // Pandoc supports compound attribute blocks: classes, IDs, and key=value pairs.
1542        let content = "Use `code`{.lang #id key=value} here.\n";
1543        let ranges = detect_inline_code_attr_ranges(content);
1544        assert_eq!(ranges.len(), 1);
1545        assert_eq!(&content[ranges[0].start..ranges[0].end], "{.lang #id key=value}");
1546    }
1547
1548    #[test]
1549    fn test_detect_bracketed_span() {
1550        let content = "This is [some text]{.smallcaps} here.\n";
1551        let ranges = detect_bracketed_span_ranges(content);
1552        assert_eq!(ranges.len(), 1);
1553        let r = &ranges[0];
1554        assert_eq!(&content[r.start..r.end], "[some text]{.smallcaps}");
1555    }
1556
1557    #[test]
1558    fn test_bracketed_span_does_not_match_link() {
1559        // `[text](url)` is a link, not a bracketed span.
1560        let content = "A [link](http://example.com) here.\n";
1561        let ranges = detect_bracketed_span_ranges(content);
1562        assert_eq!(ranges.len(), 0);
1563    }
1564
1565    #[test]
1566    fn test_bracketed_span_does_not_match_reference_link() {
1567        // `[text][ref]` is a reference link.
1568        let content = "A [ref][def] here.\n[def]: http://example.com\n";
1569        let ranges = detect_bracketed_span_ranges(content);
1570        assert_eq!(ranges.len(), 0);
1571    }
1572
1573    #[test]
1574    fn test_bracketed_span_multiple_on_one_line() {
1575        let content = "[one]{.a} and [two]{.b} together.\n";
1576        let ranges = detect_bracketed_span_ranges(content);
1577        assert_eq!(ranges.len(), 2);
1578        assert_eq!(&content[ranges[0].start..ranges[0].end], "[one]{.a}");
1579        assert_eq!(&content[ranges[1].start..ranges[1].end], "[two]{.b}");
1580    }
1581
1582    #[test]
1583    fn test_bracketed_span_rejects_empty_content() {
1584        // Both bracket and brace bodies require at least one character.
1585        let content = "[]{.x} and [x]{} here.\n";
1586        let ranges = detect_bracketed_span_ranges(content);
1587        assert_eq!(ranges.len(), 0);
1588    }
1589
1590    #[test]
1591    fn test_bracketed_span_at_start_of_line() {
1592        let content = "[head]{.intro} starts the line.\n";
1593        let ranges = detect_bracketed_span_ranges(content);
1594        assert_eq!(ranges.len(), 1);
1595        assert_eq!(ranges[0].start, 0);
1596        assert_eq!(&content[ranges[0].start..ranges[0].end], "[head]{.intro}");
1597    }
1598
1599    #[test]
1600    fn test_detect_line_block_single() {
1601        let content = "| The Lord of the Rings\n| by J.R.R. Tolkien\n";
1602        let ranges = detect_line_block_ranges(content);
1603        assert_eq!(ranges.len(), 1);
1604        assert_eq!(ranges[0].start, 0);
1605        assert_eq!(ranges[0].end, content.len());
1606    }
1607
1608    #[test]
1609    fn test_line_block_no_trailing_newline() {
1610        // Single-line block with no terminating newline must be flushed.
1611        let content = "| Only line";
1612        let ranges = detect_line_block_ranges(content);
1613        assert_eq!(ranges.len(), 1);
1614        assert_eq!(ranges[0].start, 0);
1615        assert_eq!(ranges[0].end, content.len());
1616    }
1617
1618    #[test]
1619    fn test_line_block_indented_pipe_is_not_continuation() {
1620        // An indented line whose non-whitespace content begins with `|` is
1621        // not a plain-text continuation; it ends the active block.
1622        let content = "| First\n  | indented\n";
1623        let ranges = detect_line_block_ranges(content);
1624        assert_eq!(ranges.len(), 1);
1625        assert_eq!(ranges[0].end, "| First\n".len());
1626    }
1627
1628    #[test]
1629    fn test_line_block_continuation_with_indent() {
1630        // A line starting with whitespace (and NOT `|`) inside a line block is
1631        // a continuation of the previous line.
1632        let content = "| First line\n  continuation\n| Second\n";
1633        let ranges = detect_line_block_ranges(content);
1634        assert_eq!(ranges.len(), 1);
1635    }
1636
1637    #[test]
1638    fn test_line_block_separated_by_blank() {
1639        let content = "| Block A\n\n| Block B\n";
1640        let ranges = detect_line_block_ranges(content);
1641        assert_eq!(ranges.len(), 2);
1642    }
1643
1644    #[test]
1645    fn test_line_block_does_not_match_pipe_table() {
1646        // A `| col |...| row` line ending with `|` is a pipe-table row, not a line block.
1647        let content = "| col1 | col2 |\n|------|------|\n";
1648        let ranges = detect_line_block_ranges(content);
1649        assert_eq!(ranges.len(), 0);
1650    }
1651
1652    #[test]
1653    fn test_detect_pipe_table_caption_below() {
1654        let content = "\
1655| col1 | col2 |
1656|------|------|
1657| a    | b    |
1658
1659: My caption
1660";
1661        let ranges = detect_pipe_table_caption_ranges(content);
1662        assert_eq!(ranges.len(), 1);
1663        let cap = &content[ranges[0].start..ranges[0].end];
1664        assert!(cap.starts_with(": My caption"));
1665    }
1666
1667    #[test]
1668    fn test_detect_pipe_table_caption_above() {
1669        let content = "\
1670: Caption first
1671
1672| col1 | col2 |
1673|------|------|
1674| a    | b    |
1675";
1676        let ranges = detect_pipe_table_caption_ranges(content);
1677        assert_eq!(ranges.len(), 1);
1678    }
1679
1680    #[test]
1681    fn test_colon_line_without_adjacent_table_is_definition_term() {
1682        // A `: text` line not adjacent to a table is part of a definition list.
1683        let content = "Term\n: definition\n";
1684        let ranges = detect_pipe_table_caption_ranges(content);
1685        assert_eq!(ranges.len(), 0);
1686    }
1687
1688    #[test]
1689    fn test_pipe_table_caption_two_blank_lines_does_not_match() {
1690        // Pandoc requires exactly one blank line between table and caption.
1691        let content = "\
1692| a | b |
1693|---|---|
1694| 1 | 2 |
1695
1696
1697: Caption
1698";
1699        let ranges = detect_pipe_table_caption_ranges(content);
1700        assert_eq!(ranges.len(), 0);
1701    }
1702
1703    #[test]
1704    fn test_pipe_table_caption_no_blank_line_does_not_match() {
1705        // Adjacent without a blank line is not a caption either.
1706        let content = "\
1707| a | b |
1708|---|---|
1709| 1 | 2 |
1710: Caption
1711";
1712        let ranges = detect_pipe_table_caption_ranges(content);
1713        assert_eq!(ranges.len(), 0);
1714    }
1715
1716    #[test]
1717    fn test_pipe_table_caption_no_trailing_newline() {
1718        // Caption is the final line of the document with no newline; the
1719        // computed end must equal the content length, not overshoot.
1720        let content = "\
1721| a | b |
1722|---|---|
1723| 1 | 2 |
1724
1725: Trailing caption";
1726        let ranges = detect_pipe_table_caption_ranges(content);
1727        assert_eq!(ranges.len(), 1);
1728        assert_eq!(ranges[0].end, content.len());
1729        assert_eq!(&content[ranges[0].start..ranges[0].end], ": Trailing caption");
1730    }
1731
1732    #[test]
1733    fn test_pipe_table_caption_handles_crlf() {
1734        // CRLF line endings must produce correct byte offsets too.
1735        let content = "| a | b |\r\n|---|---|\r\n| 1 | 2 |\r\n\r\n: CRLF caption\r\n";
1736        let ranges = detect_pipe_table_caption_ranges(content);
1737        assert_eq!(ranges.len(), 1);
1738        let cap = &content[ranges[0].start..ranges[0].end];
1739        assert!(cap.starts_with(": CRLF caption"));
1740    }
1741
1742    #[test]
1743    fn test_pipe_table_caption_lone_colon_does_not_match() {
1744        // Pandoc requires `: ` (colon-space) for a caption; bare `:` is not.
1745        let content = "\
1746| a | b |
1747|---|---|
1748| 1 | 2 |
1749
1750:
1751";
1752        let ranges = detect_pipe_table_caption_ranges(content);
1753        assert_eq!(ranges.len(), 0);
1754    }
1755
1756    #[test]
1757    fn test_detect_metadata_block_at_start() {
1758        // Standard frontmatter case — should be returned as a metadata range.
1759        let content = "---\ntitle: Doc\n---\n\nBody.\n";
1760        let ranges = detect_yaml_metadata_block_ranges(content);
1761        assert_eq!(ranges.len(), 1);
1762        assert_eq!(ranges[0].start, 0);
1763    }
1764
1765    #[test]
1766    fn test_detect_metadata_block_mid_document() {
1767        // Pandoc allows multiple `---...---` metadata blocks anywhere.
1768        let content = "---\ntitle: Doc\n---\n\n# Heading\n\n---\nauthor: X\n---\n\nBody.\n";
1769        let ranges = detect_yaml_metadata_block_ranges(content);
1770        assert_eq!(ranges.len(), 2);
1771    }
1772
1773    #[test]
1774    fn test_metadata_block_uses_dot_terminator() {
1775        // Pandoc accepts `...` as an alternative terminator.
1776        let content = "---\ntitle: Doc\n...\n\nBody.\n";
1777        let ranges = detect_yaml_metadata_block_ranges(content);
1778        assert_eq!(ranges.len(), 1);
1779    }
1780
1781    #[test]
1782    fn test_metadata_block_unterminated_opener_skipped() {
1783        // An opener with no closer reaching EOF must NOT produce a range.
1784        let content = "---\ntitle: Doc\nbody continues forever\n";
1785        let ranges = detect_yaml_metadata_block_ranges(content);
1786        assert_eq!(ranges.len(), 0);
1787    }
1788
1789    #[test]
1790    fn test_metadata_block_dashes_after_text_are_not_opener() {
1791        // A `---` line not preceded by a blank is a horizontal rule,
1792        // not a metadata opener.
1793        let content = "Some prose paragraph.\n---\nbody: not-metadata\n---\n";
1794        let ranges = detect_yaml_metadata_block_ranges(content);
1795        assert_eq!(ranges.len(), 0);
1796    }
1797
1798    #[test]
1799    fn test_metadata_block_no_trailing_newline() {
1800        // Block at end of file with no trailing newline; end must equal
1801        // content length, not overshoot.
1802        let content = "---\ntitle: Doc\n---";
1803        let ranges = detect_yaml_metadata_block_ranges(content);
1804        assert_eq!(ranges.len(), 1);
1805        assert_eq!(ranges[0].start, 0);
1806        assert_eq!(ranges[0].end, content.len());
1807    }
1808
1809    #[test]
1810    fn test_metadata_block_handles_crlf() {
1811        // CRLF endings must produce correct byte offsets.
1812        let content = "---\r\ntitle: Doc\r\n---\r\n\r\nBody.\r\n";
1813        let ranges = detect_yaml_metadata_block_ranges(content);
1814        assert_eq!(ranges.len(), 1);
1815        let block = &content[ranges[0].start..ranges[0].end];
1816        assert!(block.starts_with("---\r\n"));
1817        assert!(block.ends_with("---\r\n"));
1818    }
1819
1820    #[test]
1821    fn test_collect_pandoc_header_slugs_skips_code_blocks() {
1822        let content = "\
1823# Real Heading
1824
1825```bash
1826# This is a bash comment
1827#!/usr/bin/env bash
1828```
1829
1830# Another Heading
1831";
1832        let slugs = collect_pandoc_header_slugs(content);
1833        assert!(slugs.contains("real-heading"));
1834        assert!(slugs.contains("another-heading"));
1835        assert!(!slugs.contains("this-is-a-bash-comment"));
1836        assert!(!slugs.iter().any(|s| s.contains("usr-bin")));
1837    }
1838
1839    #[test]
1840    fn test_detect_simple_grid_table() {
1841        let content = "\
1842+---------+---------+
1843| col1    | col2    |
1844+=========+=========+
1845| a       | b       |
1846+---------+---------+
1847";
1848        let ranges = detect_grid_table_ranges(content);
1849        assert_eq!(ranges.len(), 1);
1850        assert_eq!(ranges[0].start, 0);
1851        assert_eq!(ranges[0].end, content.len());
1852    }
1853
1854    #[test]
1855    fn test_grid_table_with_surrounding_text() {
1856        let content = "\
1857Before.
1858
1859+---+---+
1860| a | b |
1861+---+---+
1862| 1 | 2 |
1863+---+---+
1864
1865After.
1866";
1867        let ranges = detect_grid_table_ranges(content);
1868        assert_eq!(ranges.len(), 1);
1869        let region = &content[ranges[0].start..ranges[0].end];
1870        assert!(region.contains("+---+---+"));
1871        assert!(!region.contains("Before"));
1872        assert!(!region.contains("After"));
1873    }
1874
1875    #[test]
1876    fn test_lone_plus_dash_line_is_not_a_table() {
1877        let content = "Just a +---+ in prose.\n";
1878        let ranges = detect_grid_table_ranges(content);
1879        assert_eq!(ranges.len(), 0);
1880    }
1881
1882    #[test]
1883    fn test_grid_table_no_trailing_newline() {
1884        // Block at end of file with no trailing newline; end must equal
1885        // content length, not overshoot.
1886        let content = "+---+---+\n| a | b |\n+---+---+\n| 1 | 2 |\n+---+---+";
1887        let ranges = detect_grid_table_ranges(content);
1888        assert_eq!(ranges.len(), 1);
1889        assert_eq!(ranges[0].start, 0);
1890        assert_eq!(ranges[0].end, content.len());
1891    }
1892
1893    #[test]
1894    fn test_grid_table_crlf() {
1895        // CRLF endings must produce correct byte offsets.
1896        let content = "+---+---+\r\n| a | b |\r\n+---+---+\r\n| 1 | 2 |\r\n+---+---+\r\n";
1897        let ranges = detect_grid_table_ranges(content);
1898        assert_eq!(ranges.len(), 1);
1899        assert_eq!(ranges[0].start, 0);
1900        assert_eq!(ranges[0].end, content.len());
1901    }
1902
1903    #[test]
1904    fn test_grid_table_borders_only_no_content_row_rejected() {
1905        // Two border lines with no content row must not form a valid table.
1906        let content = "+---+\n+---+\n";
1907        let ranges = detect_grid_table_ranges(content);
1908        assert_eq!(ranges.len(), 0);
1909    }
1910
1911    // -----------------------------------------------------------------------
1912    // Multi-line table tests
1913    // -----------------------------------------------------------------------
1914
1915    #[test]
1916    fn test_detect_multi_line_table() {
1917        let content = "\
1918-------------------------------------------------------------
1919 Centered   Default           Right Left
1920  Header    Aligned         Aligned Aligned
1921----------- ------- --------------- -------------------------
1922   First    row                12.0 Example of a row that
1923                                    spans multiple lines.
1924
1925  Second    row                 5.0 Here's another one. Note
1926                                    the blank line between
1927                                    rows.
1928-------------------------------------------------------------
1929";
1930        let ranges = detect_multi_line_table_ranges(content);
1931        assert_eq!(ranges.len(), 1);
1932        assert_eq!(ranges[0].start, 0);
1933        assert_eq!(ranges[0].end, content.len());
1934    }
1935
1936    #[test]
1937    fn test_simple_dash_header_underline_only_does_not_match() {
1938        // The dash line has length 8 < 10 so it is not a MULTI_LINE_BORDER,
1939        // and it is not a MULTI_LINE_UNDERLINE (only one dash run — no spaces).
1940        let content = "Some text\n--------\nMore text\n";
1941        let ranges = detect_multi_line_table_ranges(content);
1942        assert_eq!(ranges.len(), 0);
1943    }
1944
1945    #[test]
1946    fn test_multi_line_table_no_trailing_newline() {
1947        // The last line has no trailing newline; end must equal content.len().
1948        let content = "\
1949-------------------------------------------------------------
1950 Centered   Default           Right Left
1951  Header    Aligned         Aligned Aligned
1952----------- ------- --------------- -------------------------
1953   First    row                12.0 Example of a row that
1954                                    spans multiple lines.
1955
1956  Second    row                 5.0 Here's another one. Note
1957                                    the blank line between
1958                                    rows.
1959-------------------------------------------------------------";
1960        let ranges = detect_multi_line_table_ranges(content);
1961        assert_eq!(ranges.len(), 1);
1962        assert_eq!(ranges[0].end, content.len());
1963    }
1964
1965    #[test]
1966    fn test_multi_line_table_crlf() {
1967        // CRLF line endings must produce correct byte offsets.
1968        let content = "\
1969-------------------------------------------------------------\r\n\
1970 Centered   Default           Right Left\r\n\
1971  Header    Aligned         Aligned Aligned\r\n\
1972----------- ------- --------------- -------------------------\r\n\
1973   First    row                12.0 Example of a row that\r\n\
1974                                    spans multiple lines.\r\n\
1975\r\n\
1976  Second    row                 5.0 Here's another one. Note\r\n\
1977                                    the blank line between\r\n\
1978                                    rows.\r\n\
1979-------------------------------------------------------------\r\n";
1980        let ranges = detect_multi_line_table_ranges(content);
1981        assert_eq!(ranges.len(), 1);
1982        assert_eq!(ranges[0].start, 0);
1983        assert_eq!(ranges[0].end, content.len());
1984    }
1985
1986    #[test]
1987    fn test_multi_line_table_unterminated_skipped() {
1988        // Header + underline but no closing border — must return 0 ranges.
1989        let content = "\
1990 Centered   Default
1991  Header    Aligned
1992----------- -------
1993   First    row
1994   Second   row
1995";
1996        let ranges = detect_multi_line_table_ranges(content);
1997        assert_eq!(ranges.len(), 0);
1998    }
1999
2000    #[test]
2001    fn test_multi_line_table_no_top_border() {
2002        // Valid table with no top border: header line immediately followed by
2003        // the column underline, then body rows, then closing border.
2004        let content = "\
2005  Centered   Default           Right Left
2006----------- ------- --------------- -------------------------
2007   First    row                12.0 Example
2008  Second    row                 5.0 Another
2009-------------------------------------------------------------
2010";
2011        let ranges = detect_multi_line_table_ranges(content);
2012        assert_eq!(ranges.len(), 1);
2013        assert_eq!(ranges[0].start, 0);
2014        assert_eq!(ranges[0].end, content.len());
2015    }
2016
2017    #[test]
2018    fn test_is_pandoc_raw_block_lang() {
2019        assert!(is_pandoc_raw_block_lang("{=html}"));
2020        assert!(is_pandoc_raw_block_lang("{=latex}"));
2021        assert!(is_pandoc_raw_block_lang("{=docx}"));
2022        assert!(is_pandoc_raw_block_lang("{=rst}"));
2023        // Hyphens and underscores are part of the allowed character set.
2024        assert!(is_pandoc_raw_block_lang("{=open-document}"));
2025        assert!(is_pandoc_raw_block_lang("{=my_format}"));
2026        // Uppercase is accepted (Pandoc itself is case-sensitive but the
2027        // grammar permits any ASCII alphanumeric).
2028        assert!(is_pandoc_raw_block_lang("{=HTML}"));
2029        // Reject Quarto exec blocks.
2030        assert!(!is_pandoc_raw_block_lang("{r}"));
2031        assert!(!is_pandoc_raw_block_lang("{python}"));
2032        // Reject malformed.
2033        assert!(!is_pandoc_raw_block_lang("{=}"));
2034        assert!(!is_pandoc_raw_block_lang("{=  }"));
2035        assert!(!is_pandoc_raw_block_lang("=html"));
2036        // Reject inner whitespace and special characters.
2037        assert!(!is_pandoc_raw_block_lang("{=html }"));
2038        assert!(!is_pandoc_raw_block_lang("{=ht ml}"));
2039    }
2040
2041    #[test]
2042    fn test_is_pandoc_code_class_attr() {
2043        // Single class declares the language.
2044        assert!(is_pandoc_code_class_attr("{.python}"));
2045        assert!(is_pandoc_code_class_attr("{.haskell}"));
2046        assert!(is_pandoc_code_class_attr("{.rust}"));
2047        // Multiple classes — first class is the language, rest are decoration.
2048        assert!(is_pandoc_code_class_attr("{.haskell .numberLines}"));
2049        // Class plus id.
2050        assert!(is_pandoc_code_class_attr("{#myid .python}"));
2051        // Class plus key=value attributes.
2052        assert!(is_pandoc_code_class_attr("{.python startFrom=\"10\"}"));
2053        // Class anywhere in the attribute list.
2054        assert!(is_pandoc_code_class_attr("{#snippet .python startFrom=\"10\"}"));
2055        // Identifiers with hyphens and underscores are valid.
2056        assert!(is_pandoc_code_class_attr("{.objective-c}"));
2057        assert!(is_pandoc_code_class_attr("{.my_lang}"));
2058
2059        // Reject — no class anywhere.
2060        assert!(!is_pandoc_code_class_attr("{}"));
2061        assert!(!is_pandoc_code_class_attr("{#myid}"));
2062        assert!(!is_pandoc_code_class_attr("{startFrom=\"10\"}"));
2063        // Reject — Pandoc raw block (handled by separate predicate).
2064        assert!(!is_pandoc_code_class_attr("{=html}"));
2065        // Reject — Quarto exec syntax (no leading dot).
2066        assert!(!is_pandoc_code_class_attr("{r}"));
2067        assert!(!is_pandoc_code_class_attr("{python}"));
2068        // Reject — bare dot with no identifier.
2069        assert!(!is_pandoc_code_class_attr("{.}"));
2070        // Reject — missing braces.
2071        assert!(!is_pandoc_code_class_attr(".python"));
2072        assert!(!is_pandoc_code_class_attr("python"));
2073    }
2074}
rumdl_lib/utils/pandoc.rs

rumdl_lib/utils/
pandoc.rs