Skip to main content

panache_parser/parser/inlines/
inline_ir.rs

1//! Inline IR for both CommonMark and Pandoc dialects.
2//!
3//! The inline parsing pipeline runs in three passes over an intermediate
4//! representation (IR):
5//!
6//! 1. **Scan** ([`build_ir`]): walk the source bytes once, producing a flat
7//!    [`Vec<IrEvent>`]. Opaque higher-precedence constructs (escapes, code
8//!    spans, autolinks, raw HTML, plus Pandoc math / native spans / inline
9//!    footnotes / footnote references / citations / bracketed spans) are
10//!    skipped past as a single [`IrEvent::Construct`] event whose source
11//!    range is preserved for losslessness. Delimiter runs (`*`/`_`),
12//!    bracket markers (`[`, `![`, `]`), soft line breaks, and plain text
13//!    spans become distinct events.
14//!
15//! 2. **Process brackets** ([`process_brackets`]) — CommonMark §6.3: the
16//!    bracket-stack algorithm walks `]` markers left-to-right. For each
17//!    `]`, the algorithm finds the nearest active opener and tries to
18//!    resolve the pair as a link or image: inline `[text](dest)`, full
19//!    reference `[text][label]`, collapsed `[text][]`, or shortcut
20//!    `[text]`. Under CommonMark, reference forms are validated against
21//!    the document refdef map and a successful match deactivates all
22//!    earlier active openers (§6.3 "links may not contain other links").
23//!    Under Pandoc, reference forms resolve shape-only (any non-empty
24//!    label) and the deactivation pass is skipped; outer-wins nested-link
25//!    semantics are enforced by the emission walk's `suppress_inner_links`
26//!    flag instead.
27//!
28//! 3. **Process emphasis** ([`process_emphasis_in_range`]): the classic
29//!    delimiter-stack algorithm runs over the [`IrEvent::DelimRun`]
30//!    events, pairing openers with closers and recording matches on the
31//!    runs. The pass runs first scoped to each resolved bracket pair
32//!    (innermost first), then once at top level over the residual events. Each match
33//!    consumes 1 or 2 inner-edge bytes from each side; leftover bytes
34//!    fall through to literal text. Dialect gates (Pandoc flanking rules,
35//!    mod-3 rejection, asymmetric (1,2)/(2,1) rejection, opener-count >= 4
36//!    rejection, triple-emph nesting flip, cascade-then-rerun) branch on
37//!    the `dialect` parameter.
38//!
39//! The emission walk in [`super::core::parse_inline_range_impl`] consumes
40//! three byte-keyed plans built by [`build_full_plans`]: an
41//! [`EmphasisPlan`] for delim-run dispositions, a [`BracketPlan`] for
42//! resolved link/image bracket pairs, and a [`ConstructPlan`] for
43//! standalone Pandoc constructs (inline footnotes, native spans, footnote
44//! references, citations, bracketed spans). Matched delim runs become
45//! `EMPHASIS` / `STRONG` nodes; matched bracket pairs become `LINK` /
46//! `IMAGE` nodes via the dispatcher's `try_parse_*` recognizers (called
47//! to *parse* a matched range, not to *resolve* it). Unmatched delims and
48//! brackets fall through to plain text.
49
50use crate::options::ParserOptions;
51use crate::parser::inlines::refdef_map::{RefdefMap, normalize_label};
52use std::collections::{BTreeMap, HashSet};
53
/// Which flavour of emphasis a matched delimiter pair produces.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum EmphasisKind {
    /// Regular emphasis, formed by a one-byte marker.
    Emph,
    /// Strong emphasis, formed by a two-byte marker.
    Strong,
}
59
60/// Disposition of a single delimiter byte after emphasis resolution.
61#[derive(Debug, Clone, Copy)]
62pub enum DelimChar {
63    /// Start of an opening marker. The marker spans `len` bytes from this
64    /// position; the matching closer starts at `partner` and spans
65    /// `partner_len` bytes.
66    Open {
67        len: u8,
68        partner: usize,
69        partner_len: u8,
70        kind: EmphasisKind,
71    },
72    /// Start of a closing marker. The matching opener starts at `partner`.
73    /// Emission jumps past close markers via the matching `Open` entry, so
74    /// this variant is only consulted defensively.
75    Close,
76    /// Unmatched delimiter byte; emit as literal text.
77    Literal,
78}
79
80/// Byte-keyed disposition map for `*` / `_` delimiter chars produced by
81/// the IR's emphasis pass and consumed by the inline emission walk.
82#[derive(Debug, Default, Clone)]
83pub struct EmphasisPlan {
84    by_pos: BTreeMap<usize, DelimChar>,
85}
86
87impl EmphasisPlan {
88    pub fn lookup(&self, pos: usize) -> Option<DelimChar> {
89        self.by_pos.get(&pos).copied()
90    }
91
92    pub fn is_empty(&self) -> bool {
93        self.by_pos.is_empty()
94    }
95
96    /// Construct an `EmphasisPlan` from a byte-keyed disposition map.
97    pub fn from_dispositions(by_pos: BTreeMap<usize, DelimChar>) -> Self {
98        Self { by_pos }
99    }
100}
101
102use super::bracketed_spans::try_parse_bracketed_span;
103use super::citations::{try_parse_bare_citation, try_parse_bracketed_citation};
104use super::code_spans::try_parse_code_span;
105use super::escapes::{EscapeType, try_parse_escape};
106use super::inline_footnotes::{try_parse_footnote_reference, try_parse_inline_footnote};
107use super::inline_html::try_parse_inline_html;
108use super::links::{
109    LinkScanContext, try_parse_autolink, try_parse_inline_image, try_parse_inline_link,
110    try_parse_reference_image, try_parse_reference_link,
111};
112use super::math::{
113    try_parse_display_math, try_parse_double_backslash_display_math,
114    try_parse_double_backslash_inline_math, try_parse_gfm_inline_math, try_parse_inline_math,
115    try_parse_single_backslash_display_math, try_parse_single_backslash_inline_math,
116};
117use super::native_spans::try_parse_native_span;
118
119/// One event in the inline IR.
120///
121/// Events partition the source byte range covered by the IR exactly: their
122/// `range()` values are contiguous and non-overlapping, so concatenating
123/// them reproduces the original input. This is the losslessness invariant
124/// the emission pass relies on.
125#[derive(Debug, Clone)]
126pub enum IrEvent {
127    /// Plain text byte span. Emitted as a single `TEXT` token, possibly
128    /// merged with adjacent literal-disposition delim/bracket bytes.
129    Text { start: usize, end: usize },
130
131    /// An opaque higher-precedence construct (escape, code span, autolink,
132    /// raw HTML). The emission pass re-parses these from the source byte
133    /// range using the existing per-construct emitters; we don't store a
134    /// pre-built `GreenNode` because `rowan::GreenNodeBuilder` doesn't
135    /// support inserting subtrees directly. The byte range is what makes
136    /// emission well-defined — the construct kind is recovered by the
137    /// emitter dispatching on the leading byte.
138    Construct {
139        start: usize,
140        end: usize,
141        kind: ConstructKind,
142    },
143
144    /// A `*` or `_` delimiter run. The `matches` vec is filled in by
145    /// [`process_emphasis`]; before that pass it is empty.
146    DelimRun {
147        ch: u8,
148        start: usize,
149        end: usize,
150        can_open: bool,
151        can_close: bool,
152        /// Matched fragments produced by `process_emphasis`. Each entry
153        /// is one `(byte_offset_within_run, len, partner_event_idx,
154        /// partner_byte_offset, kind, is_opener)` tuple. Empty until the
155        /// pass runs; possibly multiple entries when a single run matches
156        /// at multiple positions (e.g. a 4-run that closes 2+2 pairs).
157        matches: Vec<DelimMatch>,
158    },
159
160    /// `[` or `![` bracket marker. Resolved by [`process_brackets`].
161    OpenBracket {
162        start: usize,
163        /// `start + 1` for `[`, `start + 2` for `![`.
164        end: usize,
165        is_image: bool,
166        /// True until a later resolution rule deactivates this opener.
167        active: bool,
168        /// Filled in when the matching `CloseBracket` resolves the pair
169        /// to a link / image.
170        resolution: Option<BracketResolution>,
171    },
172
173    /// `]` bracket marker. Resolved by [`process_brackets`].
174    CloseBracket {
175        pos: usize,
176        /// True if this `]` was paired with an opener and the pair was
177        /// turned into a link / image.
178        matched: bool,
179    },
180
181    /// A soft line break (a `\n` or `\r\n` ending a paragraph-internal
182    /// line). Includes the line-ending bytes verbatim.
183    SoftBreak { start: usize, end: usize },
184
185    /// A hard line break (`  \n` / `\\\n` / `   \n` etc.). Includes any
186    /// trailing-space bytes plus the line ending.
187    HardBreak { start: usize, end: usize },
188}
189
190impl IrEvent {
191    /// The source byte range this event covers.
192    pub fn range(&self) -> (usize, usize) {
193        match self {
194            IrEvent::Text { start, end }
195            | IrEvent::Construct { start, end, .. }
196            | IrEvent::DelimRun { start, end, .. }
197            | IrEvent::OpenBracket { start, end, .. }
198            | IrEvent::SoftBreak { start, end }
199            | IrEvent::HardBreak { start, end } => (*start, *end),
200            IrEvent::CloseBracket { pos, .. } => (*pos, *pos + 1),
201        }
202    }
203}
204
/// Tag attached to every [`IrEvent::Construct`] so that emission knows
/// which parser to call when rebuilding the CST subtree.
///
/// The dedicated Pandoc kinds (`InlineFootnote`, `NativeSpan`,
/// `FootnoteReference`, `BracketedCitation`, `BareCitation`,
/// `BracketedSpan`) share one lifecycle: each is recognised in `build_ir`
/// under `Dialect::Pandoc` and consumed by the emission walk via the IR's
/// `ConstructPlan`, while the dispatcher's corresponding legacy branch is
/// gated to the CommonMark dialect only.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ConstructKind {
    /// Literal-character escape `\X` (CommonMark §2.4).
    Escape,
    /// Code span `` `code` `` (§6.1).
    CodeSpan,
    /// Autolink: `<scheme://...>` or `<email@host>` (§6.5).
    Autolink,
    /// Raw inline HTML: `<tag ...>` and friends (§6.6).
    InlineHtml,
    /// Catch-all for Pandoc opaque constructs without a dedicated kind
    /// (currently: math spans). `build_ir` pre-recognises these under
    /// `Dialect::Pandoc` solely so the emphasis pass sees the whole
    /// construct as opaque and delim runs inside cannot cross its
    /// boundary. Emission re-parses the construct through the
    /// dispatcher's existing `try_parse_*` chain.
    PandocOpaque,
    /// Inline footnote `^[note text]`; replaces the dispatcher's legacy
    /// `^[` branch under Pandoc.
    InlineFootnote,
    /// Native span `<span ...>...</span>`; replaces the dispatcher's
    /// legacy `<span>` branch under Pandoc.
    NativeSpan,
    /// Footnote reference `[^id]`; replaces the dispatcher's legacy
    /// `[^id]` branch under Pandoc.
    FootnoteReference,
    /// Bracketed citation such as `[@key]`, `[see @key, p. 1]` or
    /// `[@a; @b]`; replaces the dispatcher's legacy `[@cite]` branch
    /// under Pandoc.
    BracketedCitation,
    /// Bare citation `@key` or `-@key` (author-in-text /
    /// suppress-author); replaces the dispatcher's legacy `@` and `-@`
    /// branches under Pandoc.
    BareCitation,
    /// Bracketed span `[content]{attrs}`; replaces the dispatcher's
    /// legacy `[text]{attrs}` branch under Pandoc.
    BracketedSpan,
}
257
258/// One matched fragment within a [`IrEvent::DelimRun`].
259#[derive(Debug, Clone, Copy)]
260pub struct DelimMatch {
261    /// Byte offset of this fragment relative to the run's `start`.
262    pub offset_in_run: u8,
263    /// Number of bytes in this fragment (1 or 2).
264    pub len: u8,
265    /// Whether this fragment is the opener (`true`) or closer of the pair.
266    pub is_opener: bool,
267    /// IR event index of the partner run.
268    pub partner_event: u32,
269    /// Byte offset within the partner run of the partner fragment.
270    pub partner_offset: u8,
271    /// Emphasis kind (Emph for `len == 1`, Strong for `len == 2`).
272    pub kind: EmphasisKind,
273}
274
275/// Successful bracket resolution: the `[`...`]` pair is a link or image.
276#[derive(Debug, Clone)]
277pub struct BracketResolution {
278    /// IR event index of the matching `CloseBracket`.
279    pub close_event: u32,
280    /// Source range of the link text (between `[`/`![` and `]`).
281    pub text_start: usize,
282    pub text_end: usize,
283    /// Source range of the link suffix (`(...)`, `[label]`, `[]`, or
284    /// empty for shortcut). When `kind == ShortcutReference`,
285    /// `suffix_start == suffix_end == close_pos + 1`.
286    pub suffix_start: usize,
287    pub suffix_end: usize,
288    pub kind: LinkKind,
289}
290
/// The link / image form a bracket pair was resolved to.
#[derive(Clone, Debug)]
pub enum LinkKind {
    /// Inline form: `[text](dest)` or `[text](dest "title")`.
    Inline { dest: String, title: Option<String> },
    /// Explicit (full) reference: `[text][label]`.
    FullReference { label: String },
    /// Collapsed reference: `[text][]`. The link text doubles as the label.
    CollapsedReference,
    /// Shortcut reference: `[text]`. The link text doubles as the label.
    ShortcutReference,
}
303
304// ============================================================================
305// Pass 1: Scan
306// ============================================================================
307
308/// Scan `text[start..end]` once, producing a flat IR of events.
309///
310/// The scan is forward-only and never backtracks: each iteration either
311/// consumes a known construct (escape, code span, autolink, raw HTML),
312/// records a delim run / bracket marker / line break, or steps past a
313/// single UTF-8 boundary as plain text. Adjacent text bytes are coalesced
314/// into a single [`IrEvent::Text`] event by the run-flush step.
315pub fn build_ir(text: &str, start: usize, end: usize, config: &ParserOptions) -> Vec<IrEvent> {
316    let mut events = Vec::new();
317    build_ir_into(text, start, end, config, &mut events);
318    events
319}
320
321/// Like [`build_ir`] but writes into a caller-provided `Vec<IrEvent>`,
322/// clearing it first. Used by [`build_full_plans`] to amortise the
323/// per-call allocation through a thread-local scratch pool.
324pub(super) fn build_ir_into(
325    text: &str,
326    start: usize,
327    end: usize,
328    config: &ParserOptions,
329    events: &mut Vec<IrEvent>,
330) {
331    events.clear();
332    let bytes = text.as_bytes();
333    let exts = &config.extensions;
334    let is_commonmark = config.dialect == crate::options::Dialect::CommonMark;
335
336    let mut pos = start;
337    let mut text_run_start = start;
338    // Pandoc-only: extent of the current bracket-shape link/image's
339    // opaque range. While `pos < pandoc_bracket_extent`, autolinks /
340    // raw HTML / native spans are NOT recognised — pandoc-native
341    // treats `[link text]` as opaque to those constructs (CommonMark
342    // spec example #526 / #538). The lookahead at `[`/`![` sets this
343    // when a bracket-shape forms a valid link/image; once `pos`
344    // passes the extent, normal scanning resumes. CommonMark
345    // dialect's link-text-vs-autolink ordering is handled by the
346    // dispatcher's `try_parse_inline_link` rejecting outer matches
347    // when the link text contains a valid autolink (a different
348    // mechanism, see `LinkScanContext.skip_autolinks`).
349    let mut pandoc_bracket_extent: usize = 0;
350
351    // Pre-computed byte mask: `mask[b]` is `true` iff byte `b` could
352    // start any IR-recognised construct under the current dialect /
353    // extensions. Used to bulk-skip plain bytes between structural
354    // bytes — the per-byte branch chain below only runs at positions
355    // where a construct is actually possible. Non-ASCII bytes
356    // (>= 0x80) are never structural and are skipped together with
357    // ASCII plain text.
358    let mask = build_ir_byte_mask(config);
359
360    macro_rules! flush_text {
361        () => {
362            if pos > text_run_start {
363                events.push(IrEvent::Text {
364                    start: text_run_start,
365                    end: pos,
366                });
367            }
368        };
369    }
370
371    while pos < end {
372        // Fast-skip plain bytes. `text_run_start` is preserved across
373        // the skip so the next structural-event flush picks them up.
374        while pos < end && !mask[bytes[pos] as usize] {
375            pos += 1;
376        }
377        if pos >= end {
378            break;
379        }
380        let b = bytes[pos];
381
382        // Pandoc-only: at `[` or `![`, look ahead to see if this
383        // bracket-shape forms a valid link/image. If so, suppress
384        // autolink / raw HTML / native span recognition until `pos`
385        // passes the bracket-shape's end. Skipped if we're already
386        // inside an enclosing bracket-shape's opaque range.
387        if !is_commonmark
388            && pos >= pandoc_bracket_extent
389            && (b == b'[' || (b == b'!' && pos + 1 < end && bytes[pos + 1] == b'['))
390            && let Some(len) = try_pandoc_bracket_link_extent(text, pos, end, config)
391        {
392            pandoc_bracket_extent = pos + len;
393        }
394        let in_pandoc_bracket = !is_commonmark && pos < pandoc_bracket_extent;
395
396        // Backslash escape (§2.4) — including `\\\n` hard line break.
397        if b == b'\\'
398            && let Some((len, _ch, escape_type)) = try_parse_escape(&text[pos..])
399            && pos + len <= end
400        {
401            let enabled = match escape_type {
402                EscapeType::Literal => is_commonmark || exts.all_symbols_escapable,
403                EscapeType::HardLineBreak => exts.escaped_line_breaks,
404                EscapeType::NonbreakingSpace => exts.all_symbols_escapable,
405            };
406            if enabled {
407                flush_text!();
408                let kind = match escape_type {
409                    EscapeType::HardLineBreak => {
410                        events.push(IrEvent::HardBreak {
411                            start: pos,
412                            end: pos + len,
413                        });
414                        pos += len;
415                        text_run_start = pos;
416                        continue;
417                    }
418                    EscapeType::Literal | EscapeType::NonbreakingSpace => ConstructKind::Escape,
419                };
420                events.push(IrEvent::Construct {
421                    start: pos,
422                    end: pos + len,
423                    kind,
424                });
425                pos += len;
426                text_run_start = pos;
427                continue;
428            }
429        }
430
431        // Code span (§6.1) — opaque to emphasis and brackets.
432        if b == b'`'
433            && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
434            && pos + len <= end
435        {
436            flush_text!();
437            events.push(IrEvent::Construct {
438                start: pos,
439                end: pos + len,
440                kind: ConstructKind::CodeSpan,
441            });
442            pos += len;
443            text_run_start = pos;
444            continue;
445        }
446
447        // Pandoc-only: math spans are opaque to emphasis. The legacy
448        // `parse_until_closer_with_nested_*` skip-list includes inline
449        // math; without recognising it here, delim runs inside `$math$`
450        // would be picked up by the emphasis pass and break losslessness
451        // (the dispatcher's math parser would later re-claim the bytes,
452        // duplicating content).
453        if !is_commonmark && let Some(len) = try_pandoc_math_opaque(text, pos, end, config) {
454            flush_text!();
455            events.push(IrEvent::Construct {
456                start: pos,
457                end: pos + len,
458                kind: ConstructKind::PandocOpaque,
459            });
460            pos += len;
461            text_run_start = pos;
462            continue;
463        }
464
465        // Pandoc-only: native span `<span ...>...</span>`. Must come
466        // before the generic autolink/raw-html branches so the open tag
467        // doesn't get claimed as inline HTML. Span content is opaque to
468        // the emphasis pass; emission consumes the event via the IR's
469        // `ConstructPlan`. Suppressed inside Pandoc bracket-shape
470        // link/image text.
471        if !is_commonmark
472            && !in_pandoc_bracket
473            && b == b'<'
474            && exts.native_spans
475            && let Some((len, _, _)) = try_parse_native_span(&text[pos..])
476            && pos + len <= end
477        {
478            flush_text!();
479            events.push(IrEvent::Construct {
480                start: pos,
481                end: pos + len,
482                kind: ConstructKind::NativeSpan,
483            });
484            pos += len;
485            text_run_start = pos;
486            continue;
487        }
488
489        // Autolink (§6.5) before raw HTML — autolinks are the more
490        // specific shape inside `<...>`. Both are suppressed inside
491        // Pandoc bracket-shape link/image text (pandoc-native treats
492        // link text as opaque to autolinks and raw HTML).
493        if b == b'<' && !in_pandoc_bracket {
494            if exts.autolinks
495                && let Some((len, _)) = try_parse_autolink(&text[pos..], is_commonmark)
496                && pos + len <= end
497            {
498                flush_text!();
499                events.push(IrEvent::Construct {
500                    start: pos,
501                    end: pos + len,
502                    kind: ConstructKind::Autolink,
503                });
504                pos += len;
505                text_run_start = pos;
506                continue;
507            }
508            if exts.raw_html
509                && let Some(len) = try_parse_inline_html(&text[pos..])
510                && pos + len <= end
511            {
512                flush_text!();
513                events.push(IrEvent::Construct {
514                    start: pos,
515                    end: pos + len,
516                    kind: ConstructKind::InlineHtml,
517                });
518                pos += len;
519                text_run_start = pos;
520                continue;
521            }
522        }
523
524        // Pandoc-only: inline footnote `^[note]`. Recognized at scan
525        // time so the emphasis pass treats it as opaque (delim runs
526        // inside the footnote can't pair with delim runs outside).
527        if !is_commonmark
528            && b == b'^'
529            && exts.inline_footnotes
530            && let Some((len, _)) = try_parse_inline_footnote(&text[pos..])
531            && pos + len <= end
532        {
533            flush_text!();
534            events.push(IrEvent::Construct {
535                start: pos,
536                end: pos + len,
537                kind: ConstructKind::InlineFootnote,
538            });
539            pos += len;
540            text_run_start = pos;
541            continue;
542        }
543
544        // Pandoc-only: footnote reference `[^id]`. Recognised at scan
545        // time so the emphasis pass treats it as opaque (delim runs
546        // inside the label can't pair with delim runs outside) and the
547        // emission walk dispatches it directly via the IR's
548        // `ConstructPlan`. Must come before the generic bracket-opaque
549        // scan so the dedicated kind wins.
550        if !is_commonmark
551            && b == b'['
552            && pos + 1 < end
553            && bytes[pos + 1] == b'^'
554            && exts.footnotes
555            && let Some((len, _)) = try_parse_footnote_reference(&text[pos..])
556            && pos + len <= end
557        {
558            flush_text!();
559            events.push(IrEvent::Construct {
560                start: pos,
561                end: pos + len,
562                kind: ConstructKind::FootnoteReference,
563            });
564            pos += len;
565            text_run_start = pos;
566            continue;
567        }
568
569        // Pandoc-only: bracketed citation `[@cite]`. Recognised at
570        // scan time so the emphasis pass treats it as opaque (delim
571        // runs inside the citation can't pair with delim runs outside)
572        // and the emission walk dispatches it directly via the IR's
573        // `ConstructPlan`. Must come before the generic bracket-opaque
574        // scan so the dedicated kind wins.
575        if !is_commonmark
576            && b == b'['
577            && exts.citations
578            && let Some((len, _)) = try_parse_bracketed_citation(&text[pos..])
579            && pos + len <= end
580        {
581            flush_text!();
582            events.push(IrEvent::Construct {
583                start: pos,
584                end: pos + len,
585                kind: ConstructKind::BracketedCitation,
586            });
587            pos += len;
588            text_run_start = pos;
589            continue;
590        }
591
592        // Pandoc-only: bare citation `@key` or `-@key`. Recognised at
593        // scan time so the emission walk dispatches it directly via
594        // the IR's `ConstructPlan`. Bare citations don't contain
595        // emphasis-eligible content, so opacity is moot here — IR
596        // participation is only for dispatch consolidation.
597        if !is_commonmark
598            && (b == b'@' || (b == b'-' && pos + 1 < end && bytes[pos + 1] == b'@'))
599            && (exts.citations || exts.quarto_crossrefs)
600            && let Some((len, _, _)) = try_parse_bare_citation(&text[pos..])
601            && pos + len <= end
602        {
603            flush_text!();
604            events.push(IrEvent::Construct {
605                start: pos,
606                end: pos + len,
607                kind: ConstructKind::BareCitation,
608            });
609            pos += len;
610            text_run_start = pos;
611            continue;
612        }
613
614        // Pandoc-only: bracketed span `[content]{attrs}`. Recognised
615        // at scan time so the emphasis pass treats it as opaque (delim
616        // runs inside the span content can't pair with delim runs
617        // outside) and the emission walk dispatches it directly via
618        // the IR's `ConstructPlan`. Must come before the generic
619        // bracket-opaque scan so the dedicated kind wins.
620        // `try_parse_bracketed_span` requires `]` to be immediately
621        // followed by `{`, so this never shadows inline links
622        // (`[text](url)`) or reference links (`[label][refdef]`) —
623        // those don't have the `{attrs}` suffix.
624        if !is_commonmark
625            && b == b'['
626            && exts.bracketed_spans
627            && let Some((len, _, _)) = try_parse_bracketed_span(&text[pos..])
628            && pos + len <= end
629        {
630            flush_text!();
631            events.push(IrEvent::Construct {
632                start: pos,
633                end: pos + len,
634                kind: ConstructKind::BracketedSpan,
635            });
636            pos += len;
637            text_run_start = pos;
638            continue;
639        }
640
641        // `![` opens an image bracket. Recognised whenever any
642        // image-producing extension is on — `inline_images` for the
643        // `![alt](url)` form, or `reference_links` for the
644        // `![alt][label]` reference-image form (e.g. MultiMarkdown
645        // disables `inline_images` but uses reference images).
646        if b == b'!'
647            && pos + 1 < end
648            && bytes[pos + 1] == b'['
649            && (exts.inline_images || exts.reference_links)
650        {
651            flush_text!();
652            events.push(IrEvent::OpenBracket {
653                start: pos,
654                end: pos + 2,
655                is_image: true,
656                active: true,
657                resolution: None,
658            });
659            pos += 2;
660            text_run_start = pos;
661            continue;
662        }
663
664        // `[` opens a link bracket. Recognised whenever any
665        // link-producing extension is on — `inline_links` for
666        // `[text](url)`, or `reference_links` for `[text][label]` /
667        // `[text]` shortcut form.
668        if b == b'[' && (exts.inline_links || exts.reference_links) {
669            flush_text!();
670            events.push(IrEvent::OpenBracket {
671                start: pos,
672                end: pos + 1,
673                is_image: false,
674                active: true,
675                resolution: None,
676            });
677            pos += 1;
678            text_run_start = pos;
679            continue;
680        }
681
682        // `]` closes a link/image bracket.
683        if b == b']' {
684            flush_text!();
685            events.push(IrEvent::CloseBracket {
686                pos,
687                matched: false,
688            });
689            pos += 1;
690            text_run_start = pos;
691            continue;
692        }
693
694        // `*` or `_` delimiter run.
695        if b == b'*' || b == b'_' {
696            flush_text!();
697            let mut run_end = pos;
698            while run_end < end && bytes[run_end] == b {
699                run_end += 1;
700            }
701            let count = run_end - pos;
702            let (can_open, can_close) = compute_flanking(text, pos, count, b, config.dialect);
703            events.push(IrEvent::DelimRun {
704                ch: b,
705                start: pos,
706                end: run_end,
707                can_open,
708                can_close,
709                matches: Vec::new(),
710            });
711            pos = run_end;
712            text_run_start = pos;
713            continue;
714        }
715
716        // Hard line break: 2+ trailing spaces before newline. We detect
717        // this when we're sitting on a `\n` (or `\r\n`) and the preceding
718        // bytes within the current text run are spaces.
719        if b == b'\n' || (b == b'\r' && pos + 1 < end && bytes[pos + 1] == b'\n') {
720            // Count trailing spaces in the text accumulated so far.
721            let nl_len = if b == b'\r' { 2 } else { 1 };
722            let mut trailing_spaces = 0;
723            let mut s = pos;
724            while s > text_run_start && bytes[s - 1] == b' ' {
725                trailing_spaces += 1;
726                s -= 1;
727            }
728            if trailing_spaces >= 2 {
729                // Flush text *before* the trailing spaces.
730                if s > text_run_start {
731                    events.push(IrEvent::Text {
732                        start: text_run_start,
733                        end: s,
734                    });
735                }
736                events.push(IrEvent::HardBreak {
737                    start: s,
738                    end: pos + nl_len,
739                });
740                pos += nl_len;
741                text_run_start = pos;
742                continue;
743            }
744
745            // Soft line break: flush preceding text, emit the line ending
746            // as its own event so the emitter can render `NEWLINE` tokens
747            // verbatim.
748            flush_text!();
749            events.push(IrEvent::SoftBreak {
750                start: pos,
751                end: pos + nl_len,
752            });
753            pos += nl_len;
754            text_run_start = pos;
755            continue;
756        }
757
758        // Plain byte — advance one UTF-8 char.
759        let ch_len = text[pos..]
760            .chars()
761            .next()
762            .map_or(1, std::primitive::char::len_utf8);
763        pos += ch_len.max(1);
764    }
765
766    flush_text!();
767}
768
769/// Build a 256-entry mask: `mask[b]` is `true` iff byte `b` could start
770/// any IR-recognised construct under the current dialect / extensions.
771///
772/// This is the build-IR-specific superset of "is this byte interesting".
773/// Plain bytes between structural bytes are bulk-skipped via this mask
774/// in the [`build_ir`] hot loop; missing a byte here is a correctness
775/// bug (we'd skip past a real construct), but having extras only costs
776/// us a wasted branch round-trip.
777fn build_ir_byte_mask(config: &ParserOptions) -> [bool; 256] {
778    let mut mask = [false; 256];
779    let exts = &config.extensions;
780    let is_commonmark = config.dialect == crate::options::Dialect::CommonMark;
781
782    // Always structural for IR scanning:
783    //   `\n` / `\r` — soft / hard breaks
784    //   `\\`        — escape, hard line break, backslash math
785    //   `` ` ``     — code span (IR construct)
786    //   `*` / `_`   — emphasis delim runs (IR core)
787    mask[b'\n' as usize] = true;
788    mask[b'\r' as usize] = true;
789    mask[b'\\' as usize] = true;
790    mask[b'`' as usize] = true;
791    mask[b'*' as usize] = true;
792    mask[b'_' as usize] = true;
793
794    // Brackets: scanned whenever any bracket-shaped construct is
795    // reachable. `]` is structural unconditionally if `[` is — the IR
796    // emits a CloseBracket event regardless of which opener variant
797    // matches. `!` is gated on image-producing extensions; the leading
798    // `!` of `![alt]` is the only image entry point.
799    if exts.inline_links
800        || exts.reference_links
801        || exts.inline_images
802        || exts.bracketed_spans
803        || exts.footnotes
804        || exts.citations
805    {
806        mask[b'[' as usize] = true;
807        mask[b']' as usize] = true;
808    }
809    if exts.inline_images || exts.reference_links {
810        mask[b'!' as usize] = true;
811    }
812
813    // `<` covers autolinks, raw HTML, and Pandoc native spans.
814    if exts.autolinks || exts.raw_html || (!is_commonmark && exts.native_spans) {
815        mask[b'<' as usize] = true;
816    }
817
818    // `^` covers Pandoc inline footnotes (`^[...]` recognised in IR
819    // under Pandoc dialect). CM dialect inline footnotes go through
820    // the dispatcher, not the IR.
821    if !is_commonmark && exts.inline_footnotes {
822        mask[b'^' as usize] = true;
823    }
824
825    // `@` covers Pandoc bare citation `@key` and `[@cite]`. The leading
826    // `[` of `[@cite]` is already in the mask via the bracket gate;
827    // gating `@` here also covers the bare-citation form.
828    if !is_commonmark && (exts.citations || exts.quarto_crossrefs) {
829        mask[b'@' as usize] = true;
830        // `-` only matters as the first byte of `-@cite`. Tracking it
831        // here avoids missing the suppress-author bare citation form.
832        mask[b'-' as usize] = true;
833    }
834
835    // `$` covers Pandoc dollar / GFM math. CM doesn't recognise math
836    // in `build_ir`.
837    if !is_commonmark
838        && (exts.tex_math_dollars
839            || exts.tex_math_gfm
840            || exts.tex_math_single_backslash
841            || exts.tex_math_double_backslash)
842    {
843        mask[b'$' as usize] = true;
844    }
845
846    mask
847}
848
849// ============================================================================
850// Flanking (CommonMark §6.2)
851// ============================================================================
852
853fn compute_flanking(
854    text: &str,
855    pos: usize,
856    count: usize,
857    ch: u8,
858    dialect: crate::options::Dialect,
859) -> (bool, bool) {
860    if dialect == crate::options::Dialect::Pandoc {
861        // Pandoc-markdown's recursive-descent emphasis parser does NOT
862        // apply CommonMark §6.2 flanking rules. Instead it gates on:
863        //   - opener: must not be followed by whitespace (Pandoc
864        //     `try_parse_emphasis` line 247 in legacy core.rs).
865        //   - closer: no flanking gate at all (Pandoc-markdown's
866        //     `ender` parser only counts characters; see Markdown.hs
867        //     in pandoc/src/Text/Pandoc/Readers/Markdown.hs).
868        //   - underscore intraword hard rule: `_` adjacent to an
869        //     alphanumeric on either side cannot open / close
870        //     (Pandoc's `intraword_underscores` extension default).
871        let prev_char = (pos > 0).then(|| text[..pos].chars().last()).flatten();
872        let next_char = text.get(pos + count..).and_then(|s| s.chars().next());
873        let followed_by_ws = next_char.is_none_or(|c| c.is_whitespace());
874
875        let mut can_open = !followed_by_ws;
876        // Pandoc-markdown's `ender` (in pandoc/Readers/Markdown.hs)
877        // has no flanking restriction on closers — just a count match.
878        // Set can_close unconditionally; the per-pair match logic in
879        // `process_emphasis_in_range_filtered` constrains pairing via
880        // the equal-count rule.
881        let mut can_close = true;
882
883        if ch == b'_' {
884            let prev_is_alnum = prev_char.is_some_and(|c| c.is_alphanumeric());
885            let next_is_alnum = next_char.is_some_and(|c| c.is_alphanumeric());
886            if prev_is_alnum {
887                can_open = false;
888            }
889            if next_is_alnum {
890                can_close = false;
891            }
892        }
893
894        return (can_open, can_close);
895    }
896
897    // CommonMark §6.2 flanking.
898    let lf = is_left_flanking(text, pos, count);
899    let rf = is_right_flanking(text, pos, count);
900    if ch == b'*' {
901        (lf, rf)
902    } else {
903        let prev_char = (pos > 0).then(|| text[..pos].chars().last()).flatten();
904        let next_char = text.get(pos + count..).and_then(|s| s.chars().next());
905        let preceded_by_punct = prev_char.is_some_and(is_unicode_punct_or_symbol);
906        let followed_by_punct = next_char.is_some_and(is_unicode_punct_or_symbol);
907        let can_open = lf && (!rf || preceded_by_punct);
908        let can_close = rf && (!lf || followed_by_punct);
909        (can_open, can_close)
910    }
911}
912
913/// Pandoc-only: identify a math span starting at `pos` and return its
914/// byte length. Tries `$math$` and `$$display$$` (gated on
915/// `tex_math_dollars`), GFM `$math$` (gated on `tex_math_gfm`), and the
916/// `\(math\)` / `\[math\]` / `\\(math\\)` / `\\[math\\]` backslash
917/// forms (gated on `tex_math_single_backslash` / `_double_backslash`).
918/// Math content is opaque to emphasis: `$a * b$` must not produce an
919/// emphasis closer at the inner `*`.
920fn try_pandoc_math_opaque(
921    text: &str,
922    pos: usize,
923    end: usize,
924    config: &ParserOptions,
925) -> Option<usize> {
926    let bytes = text.as_bytes();
927    let exts = &config.extensions;
928    let b = bytes[pos];
929
930    if exts.tex_math_dollars && b == b'$' {
931        if let Some((len, _)) = try_parse_display_math(&text[pos..])
932            && pos + len <= end
933        {
934            return Some(len);
935        }
936        if let Some((len, _)) = try_parse_inline_math(&text[pos..])
937            && pos + len <= end
938        {
939            return Some(len);
940        }
941    }
942    if exts.tex_math_gfm
943        && b == b'$'
944        && let Some((len, _)) = try_parse_gfm_inline_math(&text[pos..])
945        && pos + len <= end
946    {
947        return Some(len);
948    }
949    if exts.tex_math_double_backslash && b == b'\\' {
950        if let Some((len, _)) = try_parse_double_backslash_display_math(&text[pos..])
951            && pos + len <= end
952        {
953            return Some(len);
954        }
955        if let Some((len, _)) = try_parse_double_backslash_inline_math(&text[pos..])
956            && pos + len <= end
957        {
958            return Some(len);
959        }
960    }
961    if exts.tex_math_single_backslash && b == b'\\' {
962        if let Some((len, _)) = try_parse_single_backslash_display_math(&text[pos..])
963            && pos + len <= end
964        {
965            return Some(len);
966        }
967        if let Some((len, _)) = try_parse_single_backslash_inline_math(&text[pos..])
968            && pos + len <= end
969        {
970            return Some(len);
971        }
972    }
973    None
974}
975
976/// Pandoc-only: identify a bracket-shaped opaque construct starting at
977/// `pos` and return its byte length. Tries the dispatcher's precedence
978/// order:
979///   1. `![alt](dest)` inline image
980///   2. `![alt][ref]` / `![alt]` reference image (shape-only opacity)
981///   3. `[^id]` footnote reference
982///   4. `[text](dest)` inline link
983///   5. `[text][ref]` / `[text]` reference link (shape-only opacity)
984///   6. `[@cite]` bracketed citation
985///   7. `[text]{attrs}` bracketed span
986///
987/// Returns `None` if the bytes at `pos` don't open any recognised Pandoc
988/// bracket-shaped construct. In that case the scanner falls through to
989/// the generic `OpenBracket`/`CloseBracket` emission and the dispatcher
990/// emits the bracket bytes as literal text (or as plain emphasis if the
991/// pattern matches an opener).
992/// Lookahead helper: at a `[` or `![` byte under Pandoc dialect, return
993/// the total byte length of the bracket-shape link/image if it forms a
994/// valid one, else `None`. Used by `build_ir` to suppress autolink /
995/// raw HTML / native span recognition inside Pandoc link text —
996/// pandoc-native treats link text as opaque to those constructs
997/// (CommonMark spec example #526 / #538 differs). Mirrors the
998/// dispatcher's `try_parse_*` precedence so the lookahead, the IR's
999/// `process_brackets` resolution, and the dispatcher's emission agree
1000/// on the bracket-shape's byte boundaries.
1001fn try_pandoc_bracket_link_extent(
1002    text: &str,
1003    pos: usize,
1004    end: usize,
1005    config: &ParserOptions,
1006) -> Option<usize> {
1007    let bytes = text.as_bytes();
1008    let exts = &config.extensions;
1009    let ctx = LinkScanContext::from_options(config);
1010    let allow_shortcut = exts.shortcut_reference_links;
1011
1012    // `![...]` images.
1013    if bytes[pos] == b'!' {
1014        if pos + 1 >= end || bytes[pos + 1] != b'[' {
1015            return None;
1016        }
1017        if exts.inline_images
1018            && let Some((len, _, _, _)) = try_parse_inline_image(&text[pos..], ctx)
1019            && pos + len <= end
1020        {
1021            return Some(len);
1022        }
1023        if exts.reference_links
1024            && let Some((len, _, _, _)) = try_parse_reference_image(&text[pos..], allow_shortcut)
1025            && pos + len <= end
1026        {
1027            return Some(len);
1028        }
1029        return None;
1030    }
1031
1032    // `[...]` openers — try in dispatcher order. Footnote refs
1033    // (`[^id]`), bracketed citations (`[@cite]`), and bracketed spans
1034    // (`[text]{attrs}`) are recognised by their own dedicated branches
1035    // in `build_ir` and don't need this lookahead.
1036    if exts.inline_links
1037        && let Some((len, _, _, _)) = try_parse_inline_link(&text[pos..], false, ctx)
1038        && pos + len <= end
1039    {
1040        return Some(len);
1041    }
1042    if exts.reference_links
1043        && let Some((len, _, _, _)) =
1044            try_parse_reference_link(&text[pos..], allow_shortcut, exts.inline_links, ctx)
1045        && pos + len <= end
1046    {
1047        return Some(len);
1048    }
1049
1050    None
1051}
1052
/// Approximate the CommonMark "Unicode punctuation character" test.
///
/// ASCII characters are classified with `is_ascii_punctuation`. Any
/// non-ASCII character counts as punctuation/symbol when it is neither
/// alphanumeric nor whitespace.
fn is_unicode_punct_or_symbol(c: char) -> bool {
    c.is_ascii_punctuation() || (!c.is_ascii() && !(c.is_alphanumeric() || c.is_whitespace()))
}
1060
1061fn is_left_flanking(text: &str, run_start: usize, run_len: usize) -> bool {
1062    let after = run_start + run_len;
1063    let next_char = text.get(after..).and_then(|s| s.chars().next());
1064    let prev_char = (run_start > 0)
1065        .then(|| text[..run_start].chars().last())
1066        .flatten();
1067
1068    let followed_by_ws = next_char.is_none_or(|c| c.is_whitespace());
1069    if followed_by_ws {
1070        return false;
1071    }
1072    let followed_by_punct = next_char.is_some_and(is_unicode_punct_or_symbol);
1073    if !followed_by_punct {
1074        return true;
1075    }
1076    prev_char.is_none_or(|c| c.is_whitespace() || is_unicode_punct_or_symbol(c))
1077}
1078
1079fn is_right_flanking(text: &str, run_start: usize, run_len: usize) -> bool {
1080    let after = run_start + run_len;
1081    let next_char = text.get(after..).and_then(|s| s.chars().next());
1082    let prev_char = (run_start > 0)
1083        .then(|| text[..run_start].chars().last())
1084        .flatten();
1085
1086    let preceded_by_ws = prev_char.is_none_or(|c| c.is_whitespace());
1087    if preceded_by_ws {
1088        return false;
1089    }
1090    let preceded_by_punct = prev_char.is_some_and(is_unicode_punct_or_symbol);
1091    if !preceded_by_punct {
1092        return true;
1093    }
1094    next_char.is_none_or(|c| c.is_whitespace() || is_unicode_punct_or_symbol(c))
1095}
1096
// ============================================================================
// Pass 3: Process emphasis (CommonMark §6.2)
// ============================================================================
1100
1101/// Run the CommonMark §6.3 `process_emphasis` algorithm over the IR's
1102/// delim runs. Mutates the IR in place: matched runs gain entries in their
1103/// `matches` vec, unmatched bytes stay implicit (the emission pass treats
1104/// any byte not covered by a match as literal text).
1105///
1106/// The algorithm tracks a per-bucket `openers_bottom` exclusive lower
1107/// bound to keep walk-back bounded; consume rules and the §6.2 mod-3
1108/// rejection match the reference implementation.
1109pub fn process_emphasis(events: &mut [IrEvent], dialect: crate::options::Dialect) {
1110    process_emphasis_in_range(events, 0, events.len(), dialect);
1111}
1112
1113/// Range-scoped variant of [`process_emphasis`].
1114///
1115/// Only delim runs whose IR event index lies in `[lo, hi)` are considered.
1116/// Used by [`build_full_plans`] to run emphasis pairing inside each
1117/// resolved bracket pair *before* the global top-level pass, so emphasis
1118/// can never form across a link's bracket boundary (CommonMark §6.3
1119/// requires bracket resolution to happen first when at a `]`, with
1120/// emphasis processed on the link's inner range).
1121///
1122/// The function additionally skips delim runs that already carry a
1123/// recorded match in their `matches` vec — this lets the second
1124/// (top-level) pass reuse the same algorithm without re-pairing bytes
1125/// already consumed by inner-range passes.
1126pub fn process_emphasis_in_range(
1127    events: &mut [IrEvent],
1128    lo: usize,
1129    hi: usize,
1130    dialect: crate::options::Dialect,
1131) {
1132    process_emphasis_in_range_filtered(events, lo, hi, None, dialect);
1133}
1134
1135/// Internal variant of [`process_emphasis_in_range`] with an optional
1136/// exclusion bitmap. Event indices for which `excluded[i] == true` are
1137/// treated as if their delim run were already fully consumed — used by
1138/// [`build_full_plans`] to keep the top-level emphasis pass from pairing
1139/// across a resolved bracket pair's boundary (the inner delim runs of
1140/// such a pair belong to the link's inner range and were already paired
1141/// by the scoped pass).
1142fn process_emphasis_in_range_filtered(
1143    events: &mut [IrEvent],
1144    lo: usize,
1145    hi: usize,
1146    excluded: Option<&[bool]>,
1147    dialect: crate::options::Dialect,
1148) {
1149    let is_commonmark = dialect == crate::options::Dialect::CommonMark;
1150    if is_commonmark {
1151        run_emphasis_pass(events, lo, hi, excluded, dialect, &[], false);
1152        return;
1153    }
1154    // Pandoc dialect: cascade-then-rerun. Run the standard pass, then
1155    // invalidate Emph/Strong pairs whose inner range contains an
1156    // unmatched same-char run with both can_open && can_close (Pandoc's
1157    // recursive descent would have failed those outer pairs because the
1158    // inner content has a stray, ambiguous delimiter the recursive
1159    // parser cannot pair). The invalidated pairs go into a "rejected
1160    // list" that the next iteration of the standard pass consults to
1161    // pick a different opener for the same closer (or reject the
1162    // closer altogether). Iterate to a fixed point.
1163    //
1164    // The rerun (iter 2+) runs in `strict` mode: a candidate pair is
1165    // rejected if its inner range contains an unmatched same-char run
1166    // with count > pair.count. This mirrors pandoc-markdown's
1167    // recursive-descent semantics where, e.g. inside a failed outer
1168    // `**...**` Strong, the inner `one c` parser's `option2`
1169    // (`string [c,c] >> two c mempty`) greedily consumes a stray `**`
1170    // and prevents subsequent `*` runs from pairing as Emph. Without
1171    // this gate, `**foo *bar** baz*` would produce Emph[bar** baz]
1172    // after the outer Strong invalidation, but pandoc treats it as
1173    // all-literal because the inner `**` blocks the Emph match.
1174    let mut rejected: Vec<(usize, usize)> = Vec::new();
1175    let max_iters = events.len().saturating_add(2);
1176    let mut iter = 0;
1177    loop {
1178        let strict = iter > 0;
1179        run_emphasis_pass(events, lo, hi, excluded, dialect, &rejected, strict);
1180        let invalidations = pandoc_cascade_invalidate(events);
1181        if invalidations.is_empty() {
1182            break;
1183        }
1184        rejected.extend(invalidations);
1185        iter += 1;
1186        if iter >= max_iters {
1187            break;
1188        }
1189    }
1190    // Recovery for `***A **B** C***` patterns: synthesise the inner
1191    // Strong match the standard delim-stack algorithm can't reach.
1192    pandoc_inner_strong_recovery(events);
1193}
1194
1195/// One pass of the CommonMark §6.2 emphasis pairing algorithm over the
1196/// IR's [`DelimRun`](IrEvent::DelimRun) events in `[lo, hi)`. Pandoc
1197/// dialect gates apply when `dialect == Dialect::Pandoc`. The
1198/// `rejected_pairs` list (Pandoc only) excludes specific
1199/// (opener_event_idx, closer_event_idx) pairs from matching — used by
1200/// the cascade-then-rerun loop to prevent invalidated pairs from
1201/// re-forming on the next iteration.
1202fn run_emphasis_pass(
1203    events: &mut [IrEvent],
1204    lo: usize,
1205    hi: usize,
1206    excluded: Option<&[bool]>,
1207    dialect: crate::options::Dialect,
1208    rejected_pairs: &[(usize, usize)],
1209    strict_pandoc: bool,
1210) {
1211    let is_commonmark = dialect == crate::options::Dialect::CommonMark;
1212    let hi = hi.min(events.len());
1213    if lo >= hi {
1214        return;
1215    }
1216    // Indices of DelimRun events within [lo, hi), in order, that have
1217    // not already been fully consumed by an earlier scoped pass and that
1218    // are not in the optional exclusion bitmap.
1219    let mut delim_idxs: Vec<usize> = events[lo..hi]
1220        .iter()
1221        .enumerate()
1222        .filter_map(|(i, e)| {
1223            let abs = lo + i;
1224            match e {
1225                IrEvent::DelimRun { matches, .. }
1226                    if matches.is_empty()
1227                        && excluded.is_none_or(|ex| ex.get(abs).copied() != Some(true)) =>
1228                {
1229                    Some(abs)
1230                }
1231                _ => None,
1232            }
1233        })
1234        .collect();
1235    if delim_idxs.is_empty() {
1236        return;
1237    }
1238
1239    // Working state: count (remaining unmatched chars) and source_start
1240    // (first remaining char) per delim run. Indexed by position in
1241    // `delim_idxs`.
1242    let mut count: Vec<usize> = Vec::with_capacity(delim_idxs.len());
1243    let mut source_start: Vec<usize> = Vec::with_capacity(delim_idxs.len());
1244    let mut removed: Vec<bool> = vec![false; delim_idxs.len()];
1245
1246    for &ev_idx in &delim_idxs {
1247        if let IrEvent::DelimRun { start, end, .. } = &events[ev_idx] {
1248            count.push(end - start);
1249            source_start.push(*start);
1250        }
1251    }
1252
1253    // openers_bottom[ch_idx][len%3][can_open] → exclusive lower bound
1254    // (an index into `delim_idxs`, or None meaning "no bottom yet").
1255    let mut openers_bottom: [[[Option<usize>; 2]; 3]; 2] = [[[None; 2]; 3]; 2];
1256
1257    // First active index, scanning forward.
1258    let first_active =
1259        |removed: &[bool]| -> Option<usize> { (0..removed.len()).find(|&i| !removed[i]) };
1260    let next_active = |removed: &[bool], from: usize| -> Option<usize> {
1261        (from + 1..removed.len()).find(|&i| !removed[i])
1262    };
1263    let prev_active =
1264        |removed: &[bool], from: usize| -> Option<usize> { (0..from).rev().find(|&i| !removed[i]) };
1265
1266    let min_closer_count = 1usize;
1267    let mut closer_local = first_active(&removed);
1268    while let Some(c) = closer_local {
1269        let ev_c_idx = delim_idxs[c];
1270        let (ch_c, can_open_c, can_close_c) = match &events[ev_c_idx] {
1271            IrEvent::DelimRun {
1272                ch,
1273                can_open,
1274                can_close,
1275                ..
1276            } => (*ch, *can_open, *can_close),
1277            _ => unreachable!(),
1278        };
1279        if !can_close_c || removed[c] || count[c] < min_closer_count {
1280            closer_local = next_active(&removed, c);
1281            continue;
1282        }
1283
1284        let ch_idx = if ch_c == b'*' { 0 } else { 1 };
1285        let closer_mod = count[c] % 3;
1286        let closer_open_bucket = can_open_c as usize;
1287        let bottom = openers_bottom[ch_idx][closer_mod][closer_open_bucket];
1288
1289        // Walk back to find a compatible opener.
1290        let mut found_opener: Option<usize> = None;
1291        let mut walk = prev_active(&removed, c);
1292        while let Some(o) = walk {
1293            if Some(o) == bottom {
1294                break;
1295            }
1296            let ev_o_idx = delim_idxs[o];
1297            let (ch_o, can_open_o, can_close_o) = match &events[ev_o_idx] {
1298                IrEvent::DelimRun {
1299                    ch,
1300                    can_open,
1301                    can_close,
1302                    ..
1303                } => (*ch, *can_open, *can_close),
1304                _ => unreachable!(),
1305            };
1306            if !removed[o] && ch_o == ch_c && can_open_o {
1307                let oc_sum = count[o] + count[c];
1308                let opener_both = can_open_o && can_close_o;
1309                let closer_both = can_open_c && can_close_c;
1310                let mod3_reject = is_commonmark
1311                    && (opener_both || closer_both)
1312                    && oc_sum.is_multiple_of(3)
1313                    && !(count[o].is_multiple_of(3) && count[c].is_multiple_of(3));
1314                // Pandoc-markdown rejects emph/strong pairs whose counts
1315                // disagree in the exactly-(1,2) / (2,1) shape:
1316                //   - `**foo*` (2,1): `try_parse_two` looks only for a
1317                //     `**` closer; the lone `*` doesn't satisfy that.
1318                //   - `*foo**` (1,2): `try_parse_one` encountering `**`
1319                //     tries `try_parse_two`; absence of an inner `**`
1320                //     closer cascades the outer parse to fail.
1321                // Other count combinations DO match (verified against
1322                // `pandoc -f markdown`):
1323                //   - (1,3) / (3,1) → emph match, opposite-side
1324                //     leftover `**` literal.
1325                //   - (2,3) / (3,2) → strong match, single `*` literal.
1326                //   - (3,3) → STRONG(EM(...)) nested.
1327                //   - (1..3, 4+) → match (Pandoc's ender walks the
1328                //     closer run for a valid position; algorithm
1329                //     consumes leftmost via leftover-as-literal).
1330                // Opener count >= 4 is rejected (Pandoc's
1331                // `try_parse_emphasis` has no count-4+ dispatch).
1332                let pandoc_reject = !is_commonmark
1333                    && ((count[o] == 1 && count[c] == 2)
1334                        || (count[o] == 2 && count[c] == 1)
1335                        || count[o] >= 4);
1336                let pair_rejected = !is_commonmark && {
1337                    let oe = delim_idxs[o];
1338                    let ce = delim_idxs[c];
1339                    rejected_pairs.iter().any(|&(ro, rc)| ro == oe && rc == ce)
1340                };
1341                // Pandoc strict-rerun gate (iter 2+ only): block a
1342                // candidate pair if any unmatched same-char run between
1343                // its opener and closer has remaining count strictly
1344                // greater than the consume rule for this pair.
1345                // Mirrors pandoc-markdown's recursive descent where
1346                // `one c`'s `option2` (`string [c,c] >> two c`) would
1347                // greedily consume a stray higher-count run, blocking
1348                // the outer `one c` from finding its `ender c 1` —
1349                // e.g. `**foo *bar** baz*` after the outer Strong
1350                // invalidates: a naïve rerun pairs ev1 (`*`) ↔ ev3
1351                // (`*`) as Emph (consume=1), but pandoc treats the
1352                // `**` between as having "consumed" any further
1353                // matching, leaving everything literal.
1354                let strict_block = strict_pandoc && {
1355                    let tentative_consume = if !is_commonmark && count[o] >= 3 && count[c] >= 3 {
1356                        1
1357                    } else if count[o] >= 2 && count[c] >= 2 {
1358                        2
1359                    } else {
1360                        1
1361                    };
1362                    let lo_evt = delim_idxs[o] + 1;
1363                    let hi_evt = delim_idxs[c];
1364                    (lo_evt..hi_evt).any(|k| match &events[k] {
1365                        IrEvent::DelimRun {
1366                            ch: ch_k,
1367                            start,
1368                            end,
1369                            matches,
1370                            ..
1371                        } => {
1372                            *ch_k == ch_c && {
1373                                let total = end - start;
1374                                let consumed: usize = matches.iter().map(|m| m.len as usize).sum();
1375                                total.saturating_sub(consumed) > tentative_consume
1376                            }
1377                        }
1378                        _ => false,
1379                    })
1380                };
1381                if !mod3_reject && !pandoc_reject && !pair_rejected && !strict_block {
1382                    found_opener = Some(o);
1383                    break;
1384                }
1385            }
1386            if o == 0 {
1387                break;
1388            }
1389            walk = prev_active(&removed, o);
1390        }
1391
1392        if let Some(o) = found_opener {
1393            // Consume rule:
1394            //   CommonMark — consume 2 (Strong) when both sides have
1395            //     >= 2 chars, else 1 (Emph). For `***x***` (3,3) this
1396            //     produces EM(STRONG(...)) because the first match
1397            //     consumes 2 from each side (Strong outermost).
1398            //   Pandoc — when both sides have >= 3, consume 1 first
1399            //     (Emph innermost) leaving 2 + 2 to pair as Strong on
1400            //     the second pass. This produces STRONG(EM(...)) for
1401            //     `***x***`, matching Pandoc-markdown's recursive
1402            //     `try_parse_three` algorithm.
1403            let consume = if !is_commonmark && count[o] >= 3 && count[c] >= 3 {
1404                1
1405            } else if count[o] >= 2 && count[c] >= 2 {
1406                2
1407            } else {
1408                1
1409            };
1410            let kind = if consume == 2 {
1411                EmphasisKind::Strong
1412            } else {
1413                EmphasisKind::Emph
1414            };
1415
1416            // Opener consumes inner-edge (rightmost) chars.
1417            let opener_match_offset =
1418                source_start[o] + count[o] - consume - source_start_event(&events[delim_idxs[o]]);
1419            // Closer consumes inner-edge (leftmost) chars.
1420            let closer_match_offset = source_start[c] - source_start_event(&events[delim_idxs[c]]);
1421
1422            // Record match on opener.
1423            if let IrEvent::DelimRun { matches, .. } = &mut events[delim_idxs[o]] {
1424                matches.push(DelimMatch {
1425                    offset_in_run: opener_match_offset as u8,
1426                    len: consume as u8,
1427                    is_opener: true,
1428                    partner_event: delim_idxs[c] as u32,
1429                    partner_offset: closer_match_offset as u8,
1430                    kind,
1431                });
1432            }
1433            // Record match on closer.
1434            if let IrEvent::DelimRun { matches, .. } = &mut events[delim_idxs[c]] {
1435                matches.push(DelimMatch {
1436                    offset_in_run: closer_match_offset as u8,
1437                    len: consume as u8,
1438                    is_opener: false,
1439                    partner_event: delim_idxs[o] as u32,
1440                    partner_offset: opener_match_offset as u8,
1441                    kind,
1442                });
1443            }
1444
1445            count[o] -= consume;
1446            source_start[c] += consume;
1447            count[c] -= consume;
1448
1449            // Remove all openers strictly between o and c.
1450            let mut between = next_active(&removed, o);
1451            while let Some(idx) = between {
1452                if idx == c {
1453                    break;
1454                }
1455                removed[idx] = true;
1456                between = next_active(&removed, idx);
1457            }
1458
1459            if count[o] == 0 {
1460                removed[o] = true;
1461            }
1462            if count[c] == 0 {
1463                removed[c] = true;
1464                closer_local = next_active(&removed, c);
1465            }
1466            // Else re-process the same closer with reduced count.
1467        } else {
1468            openers_bottom[ch_idx][closer_mod][closer_open_bucket] = prev_active(&removed, c);
1469            if !can_open_c {
1470                removed[c] = true;
1471            }
1472            closer_local = next_active(&removed, c);
1473        }
1474    }
1475
1476    // No further mutation needed: matches are recorded; remaining bytes
1477    // stay implicit literal. Pandoc cascade is invoked by the caller
1478    // (`process_emphasis_in_range_filtered`) once per pass so it can
1479    // accumulate invalidations into a rejected-pairs list and re-run.
1480    let _ = (&mut delim_idxs, &mut openers_bottom, min_closer_count);
1481}
1482
1483/// Pandoc-only post-processing pass over [`process_emphasis_in_range_filtered`]
1484/// matches: invalidate any matched delim pair that contains an unmatched
1485/// same-character run between its opener and closer. Returns the list
1486/// of (opener_event_idx, closer_event_idx) pairs that were invalidated
1487/// in this call, so the caller can seed a rejected-pairs list and
1488/// re-run the standard pass — this lets Pandoc re-pair the inner runs
1489/// that the invalidated outer match would have stolen via
1490/// between-removal (e.g. `*foo **bar* baz**` → after the outer
1491/// `ev0..ev2` Emph is invalidated, `ev1..ev3` matches as Strong on the
1492/// next iteration).
1493fn pandoc_cascade_invalidate(events: &mut [IrEvent]) -> Vec<(usize, usize)> {
1494    let mut invalidated_pairs: Vec<(usize, usize)> = Vec::new();
1495    // Early-exit: if there are no `DelimRun` events at all, the cascade
1496    // pass is a no-op. Avoids allocating the two scratch vecs below for
1497    // every range with no `*`/`_` runs (which is the common case for
1498    // ranges that contain only standalone constructs / brackets).
1499    if !events.iter().any(|e| matches!(e, IrEvent::DelimRun { .. })) {
1500        return invalidated_pairs;
1501    }
1502    // Reuse two scratch vecs across the inner loop iterations instead
1503    // of `.collect()` each time. These are tiny per-paragraph
1504    // allocations but the function is called for every Pandoc inline
1505    // emphasis pass and shows up in malloc traffic.
1506    let mut total: Vec<usize> = Vec::with_capacity(events.len());
1507    let mut consumed: Vec<usize> = Vec::with_capacity(events.len());
1508    loop {
1509        total.clear();
1510        consumed.clear();
1511        // Compute total bytes (run length) and consumed bytes (sum of
1512        // match lens) per DelimRun event index.
1513        total.extend(events.iter().map(|e| match e {
1514            IrEvent::DelimRun { start, end, .. } => end - start,
1515            _ => 0,
1516        }));
1517        consumed.extend(events.iter().map(|e| match e {
1518            IrEvent::DelimRun { matches, .. } => matches.iter().map(|m| m.len as usize).sum(),
1519            _ => 0,
1520        }));
1521
1522        // Find a pair to invalidate. We invalidate one and restart so
1523        // the cascade can re-evaluate dependent pairs.
1524        let mut to_invalidate: Option<(usize, u8)> = None;
1525        'outer: for opener_idx in 0..events.len() {
1526            let IrEvent::DelimRun {
1527                ch: ch_o, matches, ..
1528            } = &events[opener_idx]
1529            else {
1530                continue;
1531            };
1532            for (mi, m) in matches.iter().enumerate() {
1533                if !m.is_opener {
1534                    continue;
1535                }
1536                let closer_idx = m.partner_event as usize;
1537                if closer_idx <= opener_idx || closer_idx >= events.len() {
1538                    continue;
1539                }
1540                // Scan events strictly between opener and closer for any
1541                // DelimRun with the same `ch`, unmatched bytes, AND
1542                // both `can_open` and `can_close` (i.e., the run could
1543                // have participated in pairing on both sides). A
1544                // can_open-only or can_close-only run is a one-sided
1545                // fragment (e.g. an isolated `*` after a backslash
1546                // escape) that the Pandoc recursive-descent path would
1547                // never have tried as a nested-strong opener — those
1548                // shouldn't cascade-invalidate the surrounding pair.
1549                for k in (opener_idx + 1)..closer_idx {
1550                    if let IrEvent::DelimRun {
1551                        ch: ch_k,
1552                        can_open: co_k,
1553                        can_close: cc_k,
1554                        ..
1555                    } = &events[k]
1556                        && *ch_k == *ch_o
1557                        && consumed[k] < total[k]
1558                        && *co_k
1559                        && *cc_k
1560                    {
1561                        to_invalidate = Some((opener_idx, mi as u8));
1562                        break 'outer;
1563                    }
1564                }
1565            }
1566        }
1567
1568        let Some((opener_idx, mi)) = to_invalidate else {
1569            break;
1570        };
1571
1572        // Look up the partner event/offset before mutating.
1573        let (closer_idx, opener_offset) = match &events[opener_idx] {
1574            IrEvent::DelimRun { matches, .. } => {
1575                let m = matches[mi as usize];
1576                (m.partner_event as usize, m.offset_in_run)
1577            }
1578            _ => break,
1579        };
1580
1581        // Remove the opener match.
1582        if let IrEvent::DelimRun { matches, .. } = &mut events[opener_idx] {
1583            matches.remove(mi as usize);
1584        }
1585        // Remove the corresponding closer match (closer's match has
1586        // is_opener=false and partner_offset == opener's offset_in_run).
1587        if let IrEvent::DelimRun { matches, .. } = &mut events[closer_idx] {
1588            matches.retain(|m| m.is_opener || m.partner_offset != opener_offset);
1589        }
1590        invalidated_pairs.push((opener_idx, closer_idx));
1591    }
1592    invalidated_pairs
1593}
1594
1595/// Pandoc-only post-pass: recover the inner Strong match in
1596/// `***A **B** C***` patterns where the IR's standard pass produced
1597/// `Emph[Strong[A], "B**...** C"]` (matching the outer triple as
1598/// Strong+Emph but losing the inner `**...**`-as-Strong-of-`C` pair).
1599///
1600/// Pandoc's recursive descent here goes
1601/// `three c → ender c 2 → one c → option2 → two c`, producing
1602/// `Emph[Strong[A], "B", Strong[C]]` — two Strong nodes inside an outer
1603/// Emph. The standard delim-stack algorithm can't reach this pairing
1604/// because between-removal during the outer Emph match removes the
1605/// inner closer-side `**` (e.g. `bar**`) from the candidate pool.
1606///
1607/// This recovery scans Emph matches whose opener and closer originally
1608/// had count >= 3, and whose closer has unmatched bytes >= 2 after the
1609/// standard pass; for each, we look for an unmatched same-char
1610/// between-run with count >= 2 and `can_close = true` (the would-be
1611/// inner-Strong opener) and synthesise a Strong match that consumes
1612/// the leftmost 2 bytes of the closer (where the existing Emph match
1613/// shifts to the rightmost 1 byte). The byte-position rewrite lets
1614/// the CST emission produce well-nested `Emph[..., Strong[...]]` —
1615/// outer Emph close at the rightmost outer-triple byte, inner Strong
1616/// close at the leftmost two.
1617fn pandoc_inner_strong_recovery(events: &mut [IrEvent]) {
1618    let n = events.len();
1619    // (between_idx, opener_idx, closer_idx, len)
1620    let mut to_apply: Vec<(usize, usize, usize, u8)> = Vec::new();
1621
1622    for opener_idx in 0..n {
1623        let (open_total, open_matches_clone, ch_o) = match &events[opener_idx] {
1624            IrEvent::DelimRun {
1625                start,
1626                end,
1627                matches,
1628                ch,
1629                ..
1630            } => (*end - *start, matches.clone(), *ch),
1631            _ => continue,
1632        };
1633        if open_total < 3 {
1634            continue;
1635        }
1636
1637        for m in open_matches_clone.iter() {
1638            if !m.is_opener || m.kind != EmphasisKind::Emph {
1639                continue;
1640            }
1641            let closer_idx = m.partner_event as usize;
1642            if closer_idx <= opener_idx || closer_idx >= n {
1643                continue;
1644            }
1645
1646            let (close_total, close_consumed) = match &events[closer_idx] {
1647                IrEvent::DelimRun {
1648                    start,
1649                    end,
1650                    matches,
1651                    ..
1652                } => {
1653                    let total = end - start;
1654                    let consumed: usize = matches.iter().map(|m| m.len as usize).sum();
1655                    (total, consumed)
1656                }
1657                _ => continue,
1658            };
1659            if close_total < 3 {
1660                continue;
1661            }
1662            let leftover = close_total.saturating_sub(close_consumed);
1663            if leftover < 2 {
1664                continue;
1665            }
1666
1667            // Walk backward from closer-1 looking for the rightmost
1668            // unmatched same-char run with count >= 2 and
1669            // can_close=true.
1670            for k in ((opener_idx + 1)..closer_idx).rev() {
1671                if let IrEvent::DelimRun {
1672                    ch,
1673                    start,
1674                    end,
1675                    matches,
1676                    can_close,
1677                    ..
1678                } = &events[k]
1679                {
1680                    if *ch != ch_o || !*can_close {
1681                        continue;
1682                    }
1683                    let total = end - start;
1684                    let consumed: usize = matches.iter().map(|m| m.len as usize).sum();
1685                    let remaining = total.saturating_sub(consumed);
1686                    if remaining < 2 {
1687                        continue;
1688                    }
1689                    to_apply.push((k, opener_idx, closer_idx, 2));
1690                    break;
1691                }
1692            }
1693        }
1694    }
1695
1696    for (between_idx, opener_idx, closer_idx, len) in to_apply {
1697        // Find the existing Emph match on the closer side.
1698        let (closer_emph_match_idx, closer_emph_offset) = {
1699            let mut found: Option<(usize, u8)> = None;
1700            if let IrEvent::DelimRun { matches, .. } = &events[closer_idx] {
1701                for (mi, m) in matches.iter().enumerate() {
1702                    if !m.is_opener
1703                        && m.partner_event as usize == opener_idx
1704                        && m.kind == EmphasisKind::Emph
1705                    {
1706                        found = Some((mi, m.offset_in_run));
1707                        break;
1708                    }
1709                }
1710            }
1711            match found {
1712                Some(x) => x,
1713                None => continue,
1714            }
1715        };
1716
1717        // Find the corresponding Emph match on the opener side.
1718        let opener_emph_match_idx = {
1719            let mut found: Option<usize> = None;
1720            if let IrEvent::DelimRun { matches, .. } = &events[opener_idx] {
1721                for (mi, m) in matches.iter().enumerate() {
1722                    if m.is_opener
1723                        && m.partner_event as usize == closer_idx
1724                        && m.kind == EmphasisKind::Emph
1725                    {
1726                        found = Some(mi);
1727                        break;
1728                    }
1729                }
1730            }
1731            match found {
1732                Some(x) => x,
1733                None => continue,
1734            }
1735        };
1736
1737        // Shift the Emph closer's offset to the right of the new
1738        // Strong closer's bytes (Strong takes leftmost `len` bytes,
1739        // Emph takes the next byte).
1740        let new_closer_emph_offset = closer_emph_offset + len;
1741
1742        // Update closer's Emph offset_in_run.
1743        if let IrEvent::DelimRun { matches, .. } = &mut events[closer_idx] {
1744            matches[closer_emph_match_idx].offset_in_run = new_closer_emph_offset;
1745        }
1746        // Update opener's Emph partner_offset to point at the shifted
1747        // Emph closer position.
1748        if let IrEvent::DelimRun { matches, .. } = &mut events[opener_idx] {
1749            matches[opener_emph_match_idx].partner_offset = new_closer_emph_offset;
1750        }
1751
1752        // Add Strong opener match on the between-run.
1753        if let IrEvent::DelimRun { matches, .. } = &mut events[between_idx] {
1754            matches.push(DelimMatch {
1755                offset_in_run: 0,
1756                len,
1757                is_opener: true,
1758                partner_event: closer_idx as u32,
1759                partner_offset: closer_emph_offset,
1760                kind: EmphasisKind::Strong,
1761            });
1762        }
1763        // Add Strong closer match on the closer (at the original
1764        // pre-shift Emph-closer position; the bytes that were the
1765        // single Emph closer now become the leftmost 2 bytes of the
1766        // Strong closer).
1767        if let IrEvent::DelimRun { matches, .. } = &mut events[closer_idx] {
1768            matches.push(DelimMatch {
1769                offset_in_run: closer_emph_offset,
1770                len,
1771                is_opener: false,
1772                partner_event: between_idx as u32,
1773                partner_offset: 0,
1774                kind: EmphasisKind::Strong,
1775            });
1776        }
1777    }
1778}
1779
1780fn source_start_event(event: &IrEvent) -> usize {
1781    match event {
1782        IrEvent::DelimRun { start, .. } => *start,
1783        _ => unreachable!("source_start_event called on non-DelimRun"),
1784    }
1785}
1786
1787// ============================================================================
1788// Pass 3: Process brackets (CommonMark §6.3)
1789// ============================================================================
1790
1791/// Resolve `[`/`![`/`]` markers into link/image nodes per CommonMark §6.3
1792/// (with Pandoc-aware variations under `Dialect::Pandoc`).
1793///
1794/// Walks the IR forward looking for `]` markers. For each one, finds the
1795/// nearest active matching `[`/`![` and tries to resolve the bracket pair
1796/// as a link or image. Resolution is tried in spec order:
1797///
1798/// 1. Inline link / image: `[text](dest)` or `[text](dest "title")`.
1799/// 2. Full reference: `[text][label]`, where `label` is in `refdefs`.
1800/// 3. Collapsed reference: `[text][]`, where `text` (normalised) is in
1801///    `refdefs`.
1802/// 4. Shortcut reference: `[text]` not followed by `(` or `[`, where
1803///    `text` (normalised) is in `refdefs`.
1804///
1805/// On a match, the opener gets a `BracketResolution` and the closer is
1806/// flagged `matched`. Under `Dialect::CommonMark`, all earlier active link
1807/// openers are deactivated to implement the §6.3 "links may not contain
1808/// other links" rule (image brackets do not deactivate earlier link
1809/// openers — only links do). Under `Dialect::Pandoc`, the deactivate-pass
1810/// is skipped: pandoc-native is outer-wins for nested links (the inner
1811/// `[inner](u2)` of `[link [inner](u2)](u1)` is literal text inside the
1812/// outer link), and the dispatcher enforces this via a `suppress_inner_links`
1813/// flag during LINK-text recursion. So under Pandoc the IR can leave both
1814/// outer and inner resolved and trust the dispatcher to suppress inner
1815/// LINK emission.
1816///
1817/// On a miss the bracket pair stays opaque-as-literal and the closer is
1818/// dropped from the bracket stack so the next `]` can re-pair.
1819///
1820/// Reference-form resolution under `Dialect::Pandoc` is shape-only: any
1821/// non-empty link text or label resolves regardless of refdef presence,
1822/// matching the historical legacy `reference_resolves`-returns-`true`
1823/// behavior. (Pandoc emits LINK nodes for unresolved shortcut/collapsed/
1824/// full-reference shapes so downstream features — linter, LSP, formatter
1825/// — have a typed wrapper to walk. Refdef-aware resolution under Pandoc
1826/// is bug #1/#2 territory and is a parser-linter-LSP cross-cut deferred
1827/// to a future workstream.)
1828pub fn process_brackets(
1829    events: &mut [IrEvent],
1830    text: &str,
1831    refdefs: Option<&RefdefMap>,
1832    dialect: crate::options::Dialect,
1833) {
1834    let empty: HashSet<String> = HashSet::new();
1835    let labels: &HashSet<String> = match refdefs {
1836        Some(map) => map.as_ref(),
1837        None => &empty,
1838    };
1839    let is_commonmark = dialect == crate::options::Dialect::CommonMark;
1840    // Under Pandoc, any non-empty reference label resolves shape-only —
1841    // matches the legacy `reference_resolves` short-circuit. Under
1842    // CommonMark, the refdef map is consulted.
1843    let label_resolves = |key_norm: &str| -> bool {
1844        !key_norm.is_empty() && (!is_commonmark || labels.contains(key_norm))
1845    };
1846
1847    // Walk forward through events, treating it as a linear scan for `]`.
1848    let mut i = 0;
1849    while i < events.len() {
1850        let close_pos = match &events[i] {
1851            IrEvent::CloseBracket { pos, .. } => *pos,
1852            _ => {
1853                i += 1;
1854                continue;
1855            }
1856        };
1857
1858        // Find the nearest active OpenBracket before `i`.
1859        let mut o = match find_active_opener(events, i) {
1860            Some(o) => o,
1861            None => {
1862                i += 1;
1863                continue;
1864            }
1865        };
1866
1867        let (open_end, is_image) = match &events[o] {
1868            IrEvent::OpenBracket { end, is_image, .. } => (*end, *is_image),
1869            _ => unreachable!(),
1870        };
1871        let text_start = open_end;
1872        let text_end = close_pos;
1873        let after_close = close_pos + 1;
1874
1875        // 1. Inline link / image.
1876        if let Some((suffix_end, dest, title)) = try_inline_suffix(text, after_close) {
1877            // §6.3 link-in-link rule (CommonMark): if this is a *link*
1878            // (not an image), and any earlier active link opener exists,
1879            // deactivate them. We also deactivate openers strictly before
1880            // `o` here because matching means the inner link wins; the
1881            // spec applies this *after* matching. Pandoc skips this —
1882            // outer-wins is enforced by the dispatcher's
1883            // `suppress_inner_links` flag during LINK-text recursion.
1884            if !is_image && is_commonmark {
1885                deactivate_earlier_link_openers(events, o);
1886            }
1887            commit_resolution(
1888                events,
1889                o,
1890                i,
1891                text_start,
1892                text_end,
1893                after_close,
1894                suffix_end,
1895                LinkKind::Inline { dest, title },
1896            );
1897            // Remove the opener from the bracket stack: it has been
1898            // matched (active=false will fall out automatically since
1899            // resolution is Some).
1900            mark_opener_resolved(events, o);
1901            i += 1;
1902            continue;
1903        }
1904
1905        // 2. Full reference link: `[text][label]`.
1906        let full_ref_suffix = try_full_reference_suffix(text, after_close);
1907        if let Some((suffix_end, label_raw)) = &full_ref_suffix {
1908            let label_norm = normalize_label(label_raw);
1909            if label_resolves(&label_norm) {
1910                if !is_image && is_commonmark {
1911                    deactivate_earlier_link_openers(events, o);
1912                }
1913                commit_resolution(
1914                    events,
1915                    o,
1916                    i,
1917                    text_start,
1918                    text_end,
1919                    after_close,
1920                    *suffix_end,
1921                    LinkKind::FullReference {
1922                        label: label_raw.clone(),
1923                    },
1924                );
1925                mark_opener_resolved(events, o);
1926                i += 1;
1927                continue;
1928            }
1929            // Bracketed but unresolved label: §6.3 says we still treat
1930            // `[text][label]` as not-a-link, but the brackets get
1931            // consumed as literal text AND the shortcut form is
1932            // suppressed (since the `]` is followed by a link label).
1933        }
1934
1935        // 3. Collapsed `[]`.
1936        let link_text = &text[text_start..text_end];
1937        let link_text_norm = normalize_label(link_text);
1938        let is_collapsed = is_collapsed_marker(text, after_close);
1939        let collapsed_suffix_end = after_close + 2;
1940
1941        if is_collapsed && label_resolves(&link_text_norm) {
1942            if !is_image && is_commonmark {
1943                deactivate_earlier_link_openers(events, o);
1944            }
1945            commit_resolution(
1946                events,
1947                o,
1948                i,
1949                text_start,
1950                text_end,
1951                after_close,
1952                collapsed_suffix_end,
1953                LinkKind::CollapsedReference,
1954            );
1955            mark_opener_resolved(events, o);
1956            i += 1;
1957            continue;
1958        }
1959        // `[text][]` with text not in refdefs — falls through to
1960        // literal text; shortcut is suppressed (followed by `[]`).
1961
1962        // 4. Shortcut form: `[text]` not followed by `[]` or `[label]`.
1963        // Per CommonMark §6.3: "A shortcut reference link consists of a
1964        // link label that matches a link reference definition elsewhere
1965        // in the document and is not followed by [] or a link label."
1966        // The full-ref / collapsed shape attempts above suppress the
1967        // shortcut even when their labels don't resolve — the bracket
1968        // bytes still get consumed as literal text.
1969        let shortcut_suppressed = full_ref_suffix.is_some() || is_collapsed;
1970        if !shortcut_suppressed && label_resolves(&link_text_norm) {
1971            if !is_image && is_commonmark {
1972                deactivate_earlier_link_openers(events, o);
1973            }
1974            commit_resolution(
1975                events,
1976                o,
1977                i,
1978                text_start,
1979                text_end,
1980                after_close,
1981                after_close,
1982                LinkKind::ShortcutReference,
1983            );
1984            mark_opener_resolved(events, o);
1985            i += 1;
1986            continue;
1987        }
1988
1989        // No resolution. Drop the opener — its `]` partner is this one,
1990        // but since neither matched, the opener falls through to literal
1991        // text. We do this by deactivating the opener (so it won't be
1992        // considered for later `]` markers either).
1993        if let IrEvent::OpenBracket { active, .. } = &mut events[o] {
1994            *active = false;
1995        }
1996        let _ = &mut o;
1997        i += 1;
1998    }
1999}
2000
2001fn find_active_opener(events: &[IrEvent], close_idx: usize) -> Option<usize> {
2002    (0..close_idx).rev().find(|&i| {
2003        matches!(
2004            &events[i],
2005            IrEvent::OpenBracket {
2006                active: true,
2007                resolution: None,
2008                ..
2009            }
2010        )
2011    })
2012}
2013
2014fn deactivate_earlier_link_openers(events: &mut [IrEvent], open_idx: usize) {
2015    for ev in &mut events[..open_idx] {
2016        if let IrEvent::OpenBracket {
2017            is_image: false,
2018            active,
2019            resolution: None,
2020            ..
2021        } = ev
2022        {
2023            *active = false;
2024        }
2025    }
2026}
2027
2028fn mark_opener_resolved(events: &mut [IrEvent], open_idx: usize) {
2029    if let IrEvent::OpenBracket { active, .. } = &mut events[open_idx] {
2030        *active = false;
2031    }
2032}
2033
2034#[allow(clippy::too_many_arguments)]
2035fn commit_resolution(
2036    events: &mut [IrEvent],
2037    open_idx: usize,
2038    close_idx: usize,
2039    text_start: usize,
2040    text_end: usize,
2041    suffix_start: usize,
2042    suffix_end: usize,
2043    kind: LinkKind,
2044) {
2045    if let IrEvent::OpenBracket { resolution, .. } = &mut events[open_idx] {
2046        *resolution = Some(BracketResolution {
2047            close_event: close_idx as u32,
2048            text_start,
2049            text_end,
2050            suffix_start,
2051            suffix_end,
2052            kind,
2053        });
2054    }
2055    if let IrEvent::CloseBracket { matched, .. } = &mut events[close_idx] {
2056        *matched = true;
2057    }
2058}
2059
2060/// Try to parse `(dest)` or `(dest "title")` inline link suffix starting
2061/// at `text[pos]`. Returns `(end_pos_exclusive, dest, title)`.
2062fn try_inline_suffix(text: &str, pos: usize) -> Option<(usize, String, Option<String>)> {
2063    let bytes = text.as_bytes();
2064    if pos >= bytes.len() || bytes[pos] != b'(' {
2065        return None;
2066    }
2067    let mut p = pos + 1;
2068    // Skip leading whitespace.
2069    while p < bytes.len() && matches!(bytes[p], b' ' | b'\t' | b'\n') {
2070        p += 1;
2071    }
2072    // Empty `()` — link with empty destination.
2073    if p < bytes.len() && bytes[p] == b')' {
2074        return Some((p + 1, String::new(), None));
2075    }
2076
2077    // Parse destination.
2078    let (dest, dest_end) = parse_link_destination(text, p)?;
2079    p = dest_end;
2080
2081    // Skip whitespace.
2082    while p < bytes.len() && matches!(bytes[p], b' ' | b'\t' | b'\n') {
2083        p += 1;
2084    }
2085
2086    // Optional title.
2087    let mut title = None;
2088    if p < bytes.len() && matches!(bytes[p], b'"' | b'\'' | b'(') {
2089        let (t, t_end) = parse_link_title(text, p)?;
2090        title = Some(t);
2091        p = t_end;
2092        while p < bytes.len() && matches!(bytes[p], b' ' | b'\t' | b'\n') {
2093            p += 1;
2094        }
2095    }
2096
2097    if p >= bytes.len() || bytes[p] != b')' {
2098        return None;
2099    }
2100    Some((p + 1, dest, title))
2101}
2102
/// Parse a link destination starting at byte offset `start` of `text`.
///
/// Two shapes are accepted:
///
/// * `<bracketed>` — everything up to the closing `>`; a newline or a
///   second `<` before the `>` rejects the destination.
/// * unbracketed — a non-empty run that stops at a space, tab, newline,
///   other ASCII control byte, or an unbalanced `)`; parentheses inside
///   must balance.
///
/// In both shapes a backslash followed by another byte skips that byte.
/// Returns the raw destination text (escapes are not processed, angle
/// brackets are stripped) together with the exclusive end offset, or
/// `None` when no destination can be read.
fn parse_link_destination(text: &str, start: usize) -> Option<(String, usize)> {
    let bytes = text.as_bytes();
    let first = *bytes.get(start)?;

    if first == b'<' {
        // <bracketed>
        let inner_start = start + 1;
        let mut cursor = inner_start;
        loop {
            match bytes.get(cursor) {
                Some(b'>') => {
                    return Some((text[inner_start..cursor].to_string(), cursor + 1));
                }
                None | Some(b'\n') | Some(b'<') => return None,
                Some(b'\\') if cursor + 1 < bytes.len() => cursor += 2,
                Some(_) => cursor += 1,
            }
        }
    }

    // unbracketed: balanced parens, no spaces, no controls
    let mut cursor = start;
    let mut depth: i32 = 0;
    while let Some(&byte) = bytes.get(cursor) {
        match byte {
            b'\\' if cursor + 1 < bytes.len() => cursor += 2,
            b'(' => {
                depth += 1;
                cursor += 1;
            }
            // A `)` at depth zero belongs to the enclosing suffix.
            b')' if depth == 0 => break,
            b')' => {
                depth -= 1;
                cursor += 1;
            }
            // Space (0x20), every control byte below it, and DEL end the run.
            0x00..=0x20 | 0x7f => break,
            _ => cursor += 1,
        }
    }
    if cursor == start || depth != 0 {
        return None;
    }
    Some((text[start..cursor].to_string(), cursor))
}
2158
/// Parse a link title starting at byte offset `start` of `text`.
///
/// The title must open with `"`, `'`, or `(` and runs to the first
/// matching unescaped closer (`"`, `'`, or `)` respectively). A backslash
/// followed by another byte skips that byte, so an escaped closer does
/// not end the title.
///
/// Returns the raw title text (without the delimiters, escapes left
/// unprocessed) and the exclusive end offset just past the closer.
/// Returns `None` when `start` is out of range, when the byte at `start`
/// is not a title opener, or when the title is unterminated.
fn parse_link_title(text: &str, start: usize) -> Option<(String, usize)> {
    let bytes = text.as_bytes();
    // Checked access: an out-of-range `start` yields `None` instead of
    // panicking on the index.
    let close = match *bytes.get(start)? {
        b'"' => b'"',
        b'\'' => b'\'',
        b'(' => b')',
        _ => return None,
    };
    let begin = start + 1;
    let mut p = begin;
    while let Some(&b) = bytes.get(p) {
        if b == b'\\' && p + 1 < bytes.len() {
            // Skip the escaped byte so it cannot terminate the title.
            p += 2;
            continue;
        }
        if b == close {
            return Some((text[begin..p].to_string(), p + 1));
        }
        p += 1;
    }
    None
}
2184
/// Attempt to read a `[label]` reference suffix at byte offset `pos` of
/// `text` (i.e. directly after a `]`).
///
/// Returns `(suffix_end, label_raw)` where `suffix_end` is the exclusive
/// offset just past the closing `]` and `label_raw` is the text between
/// the brackets, unmodified. A backslash skips the byte that follows it,
/// so `\]` and `\[` do not terminate or reject the label. Returns `None`
/// when `text[pos]` is not `[`, when an unescaped `[` appears inside the
/// label, when the closing `]` is missing, or when the label is empty —
/// the collapsed form `[]` is handled separately by `is_collapsed_marker`.
fn try_full_reference_suffix(text: &str, pos: usize) -> Option<(usize, String)> {
    let bytes = text.as_bytes();
    if bytes.get(pos) != Some(&b'[') {
        return None;
    }
    let label_start = pos + 1;
    let mut cursor = label_start;
    // Running off the end of the input means there is no closing `]`.
    let close_at = loop {
        match bytes.get(cursor)? {
            b']' => break cursor,
            b'[' => return None,
            // Backslash plus the byte it escapes.
            b'\\' => cursor += 2,
            _ => cursor += 1,
        }
    };
    if close_at == label_start {
        return None;
    }
    Some((close_at + 1, text[label_start..close_at].to_string()))
}
2224
/// `true` when the two bytes of `text` starting at offset `pos` are
/// exactly `[]` (the collapsed-reference marker); `false` otherwise,
/// including when `pos` is out of range.
fn is_collapsed_marker(text: &str, pos: usize) -> bool {
    matches!(text.as_bytes().get(pos..), Some([b'[', b']', ..]))
}
2228
2229// ============================================================================
2230// Bracket plan — byte-position-keyed view of resolved brackets, consumed by
2231// the existing emission walk in `core::parse_inline_range_impl`.
2232// ============================================================================
2233
/// Disposition of a single bracket byte after [`process_brackets`].
///
/// Each variant tells the emission walk what to do when it reaches a
/// bracket byte: start a link/image node, or treat the byte as plain text.
#[derive(Debug, Clone)]
pub enum BracketDispo {
    /// `[` or `![` of a resolved link/image. Emission emits the LINK/IMAGE
    /// node and skips past `suffix_end`.
    Open {
        /// `true` for an image opener (`![`), `false` for a link opener (`[`).
        is_image: bool,
        /// Byte offset where the link text begins.
        /// NOTE(review): presumably mirrors the same-named
        /// `BracketResolution` field — confirm against the plan builder.
        text_start: usize,
        /// Byte offset where the link text ends (exclusive).
        /// NOTE(review): presumably the position of the closing `]` — confirm.
        text_end: usize,
        /// Byte offset where the link suffix (`(dest)`, `[label]`, `[]`) begins.
        /// NOTE(review): presumably the byte right after the closing `]` — confirm.
        suffix_start: usize,
        /// Byte offset just past the link suffix; emission resumes here.
        suffix_end: usize,
        /// Which link form was resolved (inline, full/collapsed/shortcut reference).
        kind: LinkKind,
    },
    /// Bracket byte (one of `[`, `]`, or `!`) that fell through to literal
    /// text. Emission accumulates into the surrounding text run.
    Literal,
}
2251
/// A byte-keyed view of the IR's bracket resolutions.
///
/// Built by [`build_bracket_plan`]. A resolved opener contributes one
/// [`BracketDispo::Open`] entry at its start byte; an unresolved opener or
/// an unmatched closer contributes a [`BracketDispo::Literal`] entry for
/// each of its bytes.
#[derive(Debug, Default, Clone)]
pub struct BracketPlan {
    // Source byte position -> disposition of the bracket byte at that position.
    by_pos: BTreeMap<usize, BracketDispo>,
}

impl BracketPlan {
    /// Returns the disposition recorded for the byte at `pos`, or `None`
    /// when the plan has no entry for that position.
    pub fn lookup(&self, pos: usize) -> Option<&BracketDispo> {
        self.by_pos.get(&pos)
    }

    /// Returns `true` when the plan holds no entries at all.
    pub fn is_empty(&self) -> bool {
        self.by_pos.is_empty()
    }
}
2267
/// A standalone Pandoc inline construct recognised by `build_ir` and
/// dispatched directly from the emission walk. Carries the construct's
/// full source range so the emission walk can slice the content for the
/// existing `emit_*` helpers without re-running the recognition.
///
/// Each variant stores only `end`; the construct's start byte is the key
/// under which the entry is filed in [`ConstructPlan`]. `end` is copied
/// from the originating `IrEvent::Construct` by [`build_construct_plan`].
/// NOTE(review): `end` is presumably exclusive, like other IR ranges —
/// confirm against `build_ir`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConstructDispo {
    /// `^[note text]` — emit via `emit_inline_footnote` after slicing
    /// the inner content.
    InlineFootnote { end: usize },
    /// `<span ...>...</span>` — emit via `emit_native_span` after
    /// re-parsing the open-tag attributes from the source range.
    NativeSpan { end: usize },
    /// `[^id]` — emit via `emit_footnote_reference` after extracting
    /// the label id from the source range.
    FootnoteReference { end: usize },
    /// `[@cite]` — emit via `emit_bracketed_citation` after slicing
    /// the inner content.
    BracketedCitation { end: usize },
    /// `@key` or `-@key` — emit via `emit_bare_citation` (or
    /// `emit_crossref` when `is_quarto_crossref_key` matches and
    /// `extensions.quarto_crossrefs` is enabled).
    BareCitation { end: usize },
    /// `[content]{attrs}` — emit via `emit_bracketed_span` after
    /// slicing the inner content and attribute string.
    BracketedSpan { end: usize },
}
2294
/// A byte-keyed view of the IR's standalone Pandoc constructs that the
/// emission walk consumes directly: inline footnotes, native spans,
/// footnote references, bracketed citations, bare citations, and
/// bracketed spans. Recognition is authoritative in `build_ir` under
/// `Dialect::Pandoc`; the dispatcher's legacy branches for these
/// constructs (`^[`, `<span>`, `[^id]`, `[@cite]`, `@cite` / `-@cite`,
/// `[text]{attrs}`) are gated to `Dialect::CommonMark` only and only
/// fire when the relevant extension is explicitly enabled.
#[derive(Debug, Default, Clone)]
pub struct ConstructPlan {
    // Construct start byte -> disposition carrying the construct's end byte.
    by_pos: BTreeMap<usize, ConstructDispo>,
}

impl ConstructPlan {
    /// Returns the construct that starts at byte `pos`, or `None` when no
    /// recorded construct starts there.
    pub fn lookup(&self, pos: usize) -> Option<&ConstructDispo> {
        self.by_pos.get(&pos)
    }

    /// Returns `true` when the plan holds no constructs.
    pub fn is_empty(&self) -> bool {
        self.by_pos.is_empty()
    }
}
2317
2318/// Build a [`ConstructPlan`] from the resolved IR. Each
2319/// `Construct { kind: InlineFootnote | NativeSpan, .. }` becomes one
2320/// entry keyed at its start byte.
2321pub fn build_construct_plan(events: &[IrEvent]) -> ConstructPlan {
2322    let mut by_pos: BTreeMap<usize, ConstructDispo> = BTreeMap::new();
2323    for ev in events {
2324        if let IrEvent::Construct { start, end, kind } = ev {
2325            match kind {
2326                ConstructKind::InlineFootnote => {
2327                    by_pos.insert(*start, ConstructDispo::InlineFootnote { end: *end });
2328                }
2329                ConstructKind::NativeSpan => {
2330                    by_pos.insert(*start, ConstructDispo::NativeSpan { end: *end });
2331                }
2332                ConstructKind::FootnoteReference => {
2333                    by_pos.insert(*start, ConstructDispo::FootnoteReference { end: *end });
2334                }
2335                ConstructKind::BracketedCitation => {
2336                    by_pos.insert(*start, ConstructDispo::BracketedCitation { end: *end });
2337                }
2338                ConstructKind::BareCitation => {
2339                    by_pos.insert(*start, ConstructDispo::BareCitation { end: *end });
2340                }
2341                ConstructKind::BracketedSpan => {
2342                    by_pos.insert(*start, ConstructDispo::BracketedSpan { end: *end });
2343                }
2344                _ => {}
2345            }
2346        }
2347    }
2348    ConstructPlan { by_pos }
2349}
2350
2351/// Build a [`BracketPlan`] from the resolved IR. Each `OpenBracket`
2352/// resolution becomes an [`BracketDispo::Open`] keyed at the opener's
2353/// start byte. Unresolved openers and unmatched closers become
2354/// `BracketDispo::Literal` so the emission path can recognise them
2355/// without re-parsing.
2356pub fn build_bracket_plan(events: &[IrEvent]) -> BracketPlan {
2357    let mut by_pos: BTreeMap<usize, BracketDispo> = BTreeMap::new();
2358    for ev in events {
2359        match ev {
2360            IrEvent::OpenBracket {
2361                start,
2362                is_image,
2363                resolution: Some(res),
2364                ..
2365            } => {
2366                by_pos.insert(
2367                    *start,
2368                    BracketDispo::Open {
2369                        is_image: *is_image,
2370                        text_start: res.text_start,
2371                        text_end: res.text_end,
2372                        suffix_start: res.suffix_start,
2373                        suffix_end: res.suffix_end,
2374                        kind: res.kind.clone(),
2375                    },
2376                );
2377            }
2378            IrEvent::OpenBracket {
2379                start,
2380                is_image,
2381                resolution: None,
2382                ..
2383            } => {
2384                let len = if *is_image { 2 } else { 1 };
2385                for off in 0..len {
2386                    by_pos.insert(*start + off, BracketDispo::Literal);
2387                }
2388            }
2389            IrEvent::CloseBracket {
2390                pos,
2391                matched: false,
2392            } => {
2393                by_pos.insert(*pos, BracketDispo::Literal);
2394            }
2395            _ => {}
2396        }
2397    }
2398    BracketPlan { by_pos }
2399}
2400
/// One-shot helper: build the IR, run all passes, and return the
/// bundled [`InlinePlans`] (emphasis dispositions, bracket resolutions,
/// and standalone Pandoc constructs) — packaged together so the inline
/// emission path can consume them in one go for either dialect.
///
/// Pass ordering follows the CommonMark §6.3 reference impl: bracket
/// resolution runs first, then emphasis is processed *scoped per resolved
/// bracket pair's inner event range*, then once more on the residual
/// top-level events. This prevents emphasis pairs from forming across a
/// link's bracket boundary, which the previous "all-emphasis-then-all-
/// brackets" order got wrong (e.g. spec example #473).
///
/// `text`, `start` and `end` are forwarded unchanged to `build_ir_into`
/// (presumably the byte range of `text` to scan — confirm against
/// `build_ir_into`). `config` supplies the dialect and the optional
/// refdef label set used during bracket resolution.
///
/// All intermediate state lives in a pooled scratch bundle that is handed
/// back to the thread-local pool when this function returns; the returned
/// plans are built from the events before that happens.
pub fn build_full_plans(
    text: &str,
    start: usize,
    end: usize,
    config: &ParserOptions,
) -> InlinePlans {
    let mut scratch = ScratchEvents::checkout();
    // `checkout` always populates `inner`, so this unwrap cannot fail here.
    let bundle = scratch.inner.as_mut().unwrap();
    bundle.events.clear();
    bundle.bracket_pairs.clear();
    bundle.excluded.clear();

    build_ir_into(text, start, end, config, &mut bundle.events);
    // §6.3 bracket resolution runs for both dialects. Under CommonMark
    // it enforces refdef-aware shortcut/collapsed/full-ref resolution
    // and the §6.3 link-in-link deactivation rule. Under Pandoc it
    // performs shape-only resolution (any non-empty label resolves) and
    // skips the deactivation pass — pandoc-native is outer-wins for
    // nested links and the dispatcher's `suppress_inner_links` flag
    // suppresses inner LINK emission during LINK-text recursion.
    process_brackets(
        &mut bundle.events,
        text,
        config.refdef_labels.as_ref(),
        config.dialect,
    );

    // Scoped emphasis pass per resolved bracket pair, innermost first.
    // We collect (open_idx, close_idx) pairs of resolved brackets and run
    // emphasis only over the events strictly between them. Innermost-first
    // ordering matters: an outer link wraps emphasis that wraps an inner
    // link, and the inner link's inner range must be paired before the
    // outer's inner range so the top-level pass sees consistent state.
    bundle.bracket_pairs.extend(
        bundle
            .events
            .iter()
            .enumerate()
            .filter_map(|(i, ev)| match ev {
                IrEvent::OpenBracket {
                    resolution: Some(res),
                    ..
                } => Some((i, res.close_event as usize)),
                _ => None,
            }),
    );
    // Innermost-first: sort by close_idx ascending, then open_idx descending.
    bundle
        .bracket_pairs
        .sort_by(|a, b| a.1.cmp(&b.1).then(b.0.cmp(&a.0)));
    // Iterate pairs by index so we can hold &mut bundle.events while
    // reading bundle.bracket_pairs (split borrow on disjoint fields).
    for i in 0..bundle.bracket_pairs.len() {
        let (open_idx, close_idx) = bundle.bracket_pairs[i];
        // Half-open event range: starts just after the opener and stops
        // before the closer.
        process_emphasis_in_range(&mut bundle.events, open_idx + 1, close_idx, config.dialect);
    }

    // Top-level emphasis pass: handles delim runs that fall outside any
    // resolved bracket pair.
    let len = bundle.events.len();
    if bundle.bracket_pairs.is_empty() {
        // Fast path: no resolved brackets means no exclusion mask needed —
        // skip the resize-and-fill pass entirely. Common for prose
        // paragraphs without inline links.
        process_emphasis_in_range_filtered(&mut bundle.events, 0, len, None, config.dialect);
    } else {
        // Build exclusion bitmap: any delim run whose event index lies
        // inside a resolved bracket pair is excluded from the top-level
        // pass. Implements the §6.3 boundary rule: emphasis at the top
        // level must not pair across a link's brackets.
        //
        // `excluded` was cleared above, so this resize yields an
        // all-`false` mask of exactly `len` slots.
        bundle.excluded.resize(len, false);
        for &(open_idx, close_idx) in &bundle.bracket_pairs {
            // Marks indices open_idx + 1 .. close_idx (closer excluded).
            for slot in bundle
                .excluded
                .iter_mut()
                .take(close_idx)
                .skip(open_idx + 1)
            {
                *slot = true;
            }
        }
        process_emphasis_in_range_filtered(
            &mut bundle.events,
            0,
            len,
            Some(&bundle.excluded),
            config.dialect,
        );
    }

    InlinePlans {
        emphasis: build_emphasis_plan(&bundle.events),
        brackets: build_bracket_plan(&bundle.events),
        constructs: build_construct_plan(&bundle.events),
    }
}
2508
/// Thread-local pool of scratch buffers used by [`build_full_plans`].
///
/// `build_full_plans` checks out one bundle for the duration of the call
/// and returns it on drop so the next call (or a recursive nested call
/// from an inline emitter) reuses the allocations. The pool is
/// per-thread — the parser is single-threaded — and bounded so a
/// long-running editor session can't accumulate stale capacity.
///
/// This type is the checkout guard: it owns one bundle while alive and
/// its `Drop` impl decides whether the bundle goes back into the pool.
struct ScratchEvents {
    // The checked-out bundle. Set to `Some` by `checkout`; taken (leaving
    // `None`) when the guard is dropped.
    inner: Option<ScratchBundle>,
}
2519
/// Reusable buffers for one `build_full_plans` call.
#[derive(Default)]
struct ScratchBundle {
    // The IR event stream that the bracket and emphasis passes mutate.
    events: Vec<IrEvent>,
    // `(open_idx, close_idx)` event-index pairs of resolved brackets.
    bracket_pairs: Vec<(usize, usize)>,
    // Per-event mask for the top-level emphasis pass: `true` marks an
    // event that lies inside a resolved bracket pair.
    excluded: Vec<bool>,
}
2526
// Per-thread stack of idle scratch bundles. `ScratchEvents::checkout` pops
// from it and the `Drop` impl for `ScratchEvents` pushes bundles back.
thread_local! {
    static IR_EVENT_POOL: std::cell::RefCell<Vec<ScratchBundle>> =
        const { std::cell::RefCell::new(Vec::new()) };
}
2531
2532impl ScratchEvents {
2533    fn checkout() -> Self {
2534        let bundle = IR_EVENT_POOL
2535            .with(|p| p.borrow_mut().pop())
2536            .unwrap_or_default();
2537        Self {
2538            inner: Some(bundle),
2539        }
2540    }
2541}
2542
2543impl Drop for ScratchEvents {
2544    fn drop(&mut self) {
2545        if let Some(mut bundle) = self.inner.take() {
2546            bundle.events.clear();
2547            bundle.bracket_pairs.clear();
2548            bundle.excluded.clear();
2549            // Cap pool depth at 8 (deepest realistic nested-link recursion)
2550            // and drop any bundle whose `events` grew past 8K (a single
2551            // pathological paragraph shouldn't pin a huge allocation
2552            // forever).
2553            if bundle.events.capacity() <= 8192 {
2554                IR_EVENT_POOL.with(|p| {
2555                    let mut pool = p.borrow_mut();
2556                    if pool.len() < 8 {
2557                        pool.push(bundle);
2558                    }
2559                });
2560            }
2561        }
2562    }
2563}
2564
/// Bundle of plans produced by [`build_full_plans`] and consumed by the
/// inline emission walk.
#[derive(Debug, Default, Clone)]
pub struct InlinePlans {
    /// Delimiter-run dispositions, built by [`build_emphasis_plan`].
    pub emphasis: EmphasisPlan,
    /// Bracket dispositions, built by [`build_bracket_plan`].
    pub brackets: BracketPlan,
    /// Standalone Pandoc constructs, built by [`build_construct_plan`].
    pub constructs: ConstructPlan,
}
2573
2574/// Convert the IR's delim-run match decisions into an [`EmphasisPlan`],
2575/// preserving the byte-keyed disposition shape the existing emission walk
2576/// consumes.
2577///
2578/// Each match on a [`DelimRun`](IrEvent::DelimRun) produces one entry in
2579/// the plan: the opener side records `Open` with the partner's source
2580/// byte and length; the closer side records `Close`. Bytes within a run
2581/// that are *not* covered by any match get a `Literal` entry, which the
2582/// emission walk uses to coalesce unmatched delimiter bytes with
2583/// surrounding plain text.
2584pub fn build_emphasis_plan(events: &[IrEvent]) -> EmphasisPlan {
2585    let mut by_pos: BTreeMap<usize, DelimChar> = BTreeMap::new();
2586    for ev in events {
2587        if let IrEvent::DelimRun {
2588            start,
2589            end,
2590            matches,
2591            ..
2592        } = ev
2593        {
2594            for m in matches {
2595                let pos = *start + m.offset_in_run as usize;
2596                let partner_run_start = match &events[m.partner_event as usize] {
2597                    IrEvent::DelimRun { start: ps, .. } => *ps,
2598                    _ => continue,
2599                };
2600                let partner_pos = partner_run_start + m.partner_offset as usize;
2601                if m.is_opener {
2602                    by_pos.insert(
2603                        pos,
2604                        DelimChar::Open {
2605                            len: m.len,
2606                            partner: partner_pos,
2607                            partner_len: m.len,
2608                            kind: m.kind,
2609                        },
2610                    );
2611                } else {
2612                    by_pos.insert(pos, DelimChar::Close);
2613                }
2614            }
2615            // Any remaining bytes (not covered by a match) are literal.
2616            for pos in *start..*end {
2617                by_pos.entry(pos).or_insert(DelimChar::Literal);
2618            }
2619        }
2620    }
2621    EmphasisPlan::from_dispositions(by_pos)
2622}
2623
#[cfg(test)]
mod tests {
    use super::*;
    use crate::options::Flavor;
    use crate::parser::inlines::inline_ir::DelimChar;
    use std::sync::Arc;

    /// Parser options for the CommonMark flavor with no refdef labels.
    fn cm_opts() -> ParserOptions {
        let flavor = Flavor::CommonMark;
        ParserOptions {
            flavor,
            dialect: crate::options::Dialect::for_flavor(flavor),
            extensions: crate::options::Extensions::for_flavor(flavor),
            pandoc_compat: crate::options::PandocCompat::default(),
            refdef_labels: None,
        }
    }

    /// Build a refdef label set from string literals.
    fn refdefs<I: IntoIterator<Item = &'static str>>(labels: I) -> RefdefMap {
        Arc::new(labels.into_iter().map(|s| s.to_string()).collect())
    }

    // NOTE(review): despite the name, only `Text`, `CloseBracket` and
    // `OpenBracket` are exercised here; `DelimRun` and `Construct` are not.
    #[test]
    fn ir_event_range_covers_all_variants() {
        let txt = IrEvent::Text { start: 0, end: 5 };
        assert_eq!(txt.range(), (0, 5));

        let close = IrEvent::CloseBracket {
            pos: 7,
            matched: false,
        };
        assert_eq!(close.range(), (7, 8));

        let open = IrEvent::OpenBracket {
            start: 1,
            end: 3,
            is_image: true,
            active: true,
            resolution: None,
        };
        assert_eq!(open.range(), (1, 3));
    }

    #[test]
    fn scan_records_text_and_delim_run() {
        let opts = cm_opts();
        let ir = build_ir("foo *bar*", 0, 9, &opts);
        // Expect: Text "foo ", DelimRun "*", Text "bar", DelimRun "*"
        assert!(matches!(ir[0], IrEvent::Text { start: 0, end: 4 }));
        assert!(matches!(
            ir[1],
            IrEvent::DelimRun {
                ch: b'*',
                start: 4,
                end: 5,
                ..
            }
        ));
        assert!(matches!(ir[2], IrEvent::Text { start: 5, end: 8 }));
        assert!(matches!(
            ir[3],
            IrEvent::DelimRun {
                ch: b'*',
                start: 8,
                end: 9,
                ..
            }
        ));
    }

    #[test]
    fn scan_records_brackets() {
        let opts = cm_opts();
        let ir = build_ir("[foo]", 0, 5, &opts);
        assert!(matches!(
            ir[0],
            IrEvent::OpenBracket {
                start: 0,
                end: 1,
                is_image: false,
                ..
            }
        ));
        assert!(matches!(ir[1], IrEvent::Text { start: 1, end: 4 }));
        assert!(matches!(
            ir[2],
            IrEvent::CloseBracket {
                pos: 4,
                matched: false
            }
        ));
    }

    #[test]
    fn scan_records_image_bracket() {
        let opts = cm_opts();
        let ir = build_ir("![alt]", 0, 6, &opts);
        assert!(matches!(
            ir[0],
            IrEvent::OpenBracket {
                start: 0,
                end: 2,
                is_image: true,
                ..
            }
        ));
    }

    #[test]
    fn scan_handles_code_span_opacity() {
        let opts = cm_opts();
        let ir = build_ir("a `*x*` b", 0, 9, &opts);
        // Code span `*x*` should be a Construct, NOT delim runs.
        let has_delim_run = ir.iter().any(|e| matches!(e, IrEvent::DelimRun { .. }));
        assert!(
            !has_delim_run,
            "code span content should not produce delim runs"
        );
        assert!(ir.iter().any(|e| matches!(
            e,
            IrEvent::Construct {
                kind: ConstructKind::CodeSpan,
                ..
            }
        )));
    }

    #[test]
    fn process_emphasis_simple_pair() {
        let opts = cm_opts();
        let mut ir = build_ir("*foo*", 0, 5, &opts);
        process_emphasis(&mut ir, opts.dialect);
        // First DelimRun (open) gets a match.
        let opener = ir
            .iter()
            .find(|e| matches!(e, IrEvent::DelimRun { start: 0, .. }))
            .unwrap();
        if let IrEvent::DelimRun { matches, .. } = opener {
            assert_eq!(matches.len(), 1);
            assert!(matches[0].is_opener);
            assert_eq!(matches[0].kind, EmphasisKind::Emph);
        }
    }

    #[test]
    fn brackets_resolve_inline_link() {
        let opts = cm_opts();
        let mut ir = build_ir("[foo](/url)", 0, 11, &opts);
        process_brackets(&mut ir, "[foo](/url)", None, opts.dialect);
        let open = ir
            .iter()
            .find(|e| matches!(e, IrEvent::OpenBracket { start: 0, .. }))
            .unwrap();
        if let IrEvent::OpenBracket { resolution, .. } = open {
            let r = resolution.as_ref().expect("inline link resolved");
            assert!(matches!(r.kind, LinkKind::Inline { .. }));
            if let LinkKind::Inline { dest, .. } = &r.kind {
                assert_eq!(dest, "/url");
            }
        }
    }

    #[test]
    fn brackets_shortcut_resolves_only_with_refdef() {
        let opts = cm_opts();
        let text = "[foo]";
        let map = refdefs(["foo"]);
        let mut ir = build_ir(text, 0, text.len(), &opts);
        process_brackets(&mut ir, text, Some(&map), opts.dialect);
        let open = ir
            .iter()
            .find(|e| matches!(e, IrEvent::OpenBracket { start: 0, .. }))
            .unwrap();
        if let IrEvent::OpenBracket { resolution, .. } = open {
            assert!(matches!(
                resolution.as_ref().unwrap().kind,
                LinkKind::ShortcutReference
            ));
        }
    }

    #[test]
    fn brackets_shortcut_falls_through_without_refdef() {
        // CMark example #523 mechanic: `[bar* baz]` is not a refdef, so
        // it must NOT resolve as a link — the brackets stay literal so
        // the inner `*` becomes available to the outer emphasis scanner.
        let opts = cm_opts();
        let text = "[bar* baz]";
        let mut ir = build_ir(text, 0, text.len(), &opts);
        process_brackets(&mut ir, text, None, opts.dialect);
        let open = ir
            .iter()
            .find(|e| matches!(e, IrEvent::OpenBracket { start: 0, .. }))
            .unwrap();
        if let IrEvent::OpenBracket { resolution, .. } = open {
            assert!(resolution.is_none(), "no refdef → bracket stays literal");
        }
    }

    /// Spec #473: `*[bar*](/url)`. The link `[bar*](/url)` resolves; the
    /// outer `*...*` MUST NOT pair across the link's bracket boundary,
    /// because the inner `*` belongs to the link text.
    #[test]
    fn full_plans_emphasis_does_not_cross_resolved_link_boundary() {
        let opts = cm_opts();
        let text = "*[bar*](/url)";
        let plans = build_full_plans(text, 0, text.len(), &opts);
        // The leading `*` (at byte 0) must NOT be matched as an emphasis
        // opener — there's no closer outside the link, and the inner `*`
        // (at byte 5) is inside the resolved link's text range so it must
        // not be paired with byte 0.
        assert!(
            matches!(plans.emphasis.lookup(0), Some(DelimChar::Literal) | None),
            "outer `*` at byte 0 must not pair across link boundary, got {:?}",
            plans.emphasis.lookup(0)
        );
        // The link `[bar*](/url)` must resolve (opener at byte 1).
        assert!(
            matches!(plans.brackets.lookup(1), Some(BracketDispo::Open { .. })),
            "link [bar*](/url) must resolve at byte 1"
        );
    }

    /// Spec #533: `[foo *bar [baz][ref]*][ref]` with `[ref]: /uri`.
    /// Inner `[baz][ref]` resolves as a link; §6.3 link-in-link rule
    /// deactivates the outer `[foo ...][ref]` so it falls through to
    /// literal brackets. Emphasis `*bar [baz][ref]*` wraps the inner link.
    #[test]
    fn full_plans_link_in_link_suppression_for_reference_links() {
        let opts = cm_opts();
        let text = "[foo *bar [baz][ref]*][ref]";
        let mut opts_with_refs = opts.clone();
        opts_with_refs.refdef_labels = Some(refdefs(["ref"]));
        let plans = build_full_plans(text, 0, text.len(), &opts_with_refs);

        // Inner `[baz][ref]` opener is at byte 10 — must resolve.
        assert!(
            matches!(plans.brackets.lookup(10), Some(BracketDispo::Open { .. })),
            "inner [baz][ref] must resolve at byte 10, got {:?}",
            plans.brackets.lookup(10)
        );
        // Outer `[foo ...][ref]` opener is at byte 0 — must NOT resolve
        // (link-in-link suppression).
        assert!(
            matches!(plans.brackets.lookup(0), Some(BracketDispo::Literal) | None),
            "outer [foo ...][ref] must fall through to literal at byte 0, got {:?}",
            plans.brackets.lookup(0)
        );
        // Trailing `[ref]` after the outer `]` is at byte 22 — it's a
        // standalone shortcut reference and must resolve.
        assert!(
            matches!(plans.brackets.lookup(22), Some(BracketDispo::Open { .. })),
            "trailing [ref] must resolve at byte 22, got {:?}",
            plans.brackets.lookup(22)
        );
        // Emphasis `*...*` at bytes 5 and 20 must pair. The outer bracket
        // is unresolved, so neither run lies inside a resolved bracket
        // pair and the top-level emphasis pass pairs them.
        assert!(
            matches!(plans.emphasis.lookup(5), Some(DelimChar::Open { .. })),
            "emphasis opener at byte 5 must pair, got {:?}",
            plans.emphasis.lookup(5)
        );
        assert!(
            matches!(plans.emphasis.lookup(20), Some(DelimChar::Close)),
            "emphasis closer at byte 20 must pair, got {:?}",
            plans.emphasis.lookup(20)
        );
    }
}