panache_parser/parser/inlines/inline_ir.rs
1//! Inline IR for both CommonMark and Pandoc dialects.
2//!
3//! The inline parsing pipeline runs in three passes over an intermediate
4//! representation (IR):
5//!
6//! 1. **Scan** ([`build_ir`]): walk the source bytes once, producing a flat
7//! [`Vec<IrEvent>`]. Opaque higher-precedence constructs (escapes, code
8//! spans, autolinks, raw HTML, plus Pandoc math / native spans / inline
9//! footnotes / footnote references / citations / bracketed spans) are
10//! skipped past as a single [`IrEvent::Construct`] event whose source
11//! range is preserved for losslessness. Delimiter runs (`*`/`_`),
12//! bracket markers (`[`, `![`, `]`), soft line breaks, and plain text
13//! spans become distinct events.
14//!
15//! 2. **Process brackets** ([`process_brackets`]) — CommonMark §6.3: the
16//! bracket-stack algorithm walks `]` markers left-to-right. For each
17//! `]`, the algorithm finds the nearest active opener and tries to
18//! resolve the pair as a link or image: inline `[text](dest)`, full
19//! reference `[text][label]`, collapsed `[text][]`, or shortcut
20//! `[text]`. Under CommonMark, reference forms are validated against
21//! the document refdef map and a successful match deactivates all
22//! earlier active openers (§6.3 "links may not contain other links").
23//! Under Pandoc, reference forms resolve shape-only (any non-empty
24//! label) and the deactivation pass is skipped; outer-wins nested-link
25//! semantics are enforced by the emission walk's `suppress_inner_links`
26//! flag instead.
27//!
28//! 3. **Process emphasis** ([`process_emphasis_in_range`]): the classic
29//! delimiter-stack algorithm runs over the [`IrEvent::DelimRun`]
30//! events, pairing openers with closers and recording matches on the
31//! runs. Runs first scoped per resolved bracket pair (innermost
32//! first), then a top-level pass over the residual events. Each match
33//! consumes 1 or 2 inner-edge bytes from each side; leftover bytes
34//! fall through to literal text. Dialect gates (Pandoc flanking rules,
35//! mod-3 rejection, asymmetric (1,2)/(2,1) rejection, opener-count >= 4
36//! rejection, triple-emph nesting flip, cascade-then-rerun) branch on
37//! the `dialect` parameter.
38//!
39//! The emission walk in [`super::core::parse_inline_range_impl`] consumes
40//! three byte-keyed plans built by [`build_full_plans`]: an
41//! [`EmphasisPlan`] for delim-run dispositions, a [`BracketPlan`] for
42//! resolved link/image bracket pairs, and a [`ConstructPlan`] for
43//! standalone Pandoc constructs (inline footnotes, native spans, footnote
44//! references, citations, bracketed spans). Matched delim runs become
45//! `EMPHASIS` / `STRONG` nodes; matched bracket pairs become `LINK` /
46//! `IMAGE` nodes via the dispatcher's `try_parse_*` recognizers (called
47//! to *parse* a matched range, not to *resolve* it). Unmatched delims and
48//! brackets fall through to plain text.
49
50use crate::options::ParserOptions;
51use crate::parser::inlines::refdef_map::{RefdefMap, normalize_label};
52use std::collections::{BTreeMap, HashSet};
53
/// Flavour of emphasis produced by a matched delimiter pair.
///
/// [`DelimMatch::kind`] documents the mapping used by the emphasis pass:
/// `Emph` for 1-byte markers, `Strong` for 2-byte markers.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmphasisKind {
    /// Regular emphasis (1-byte marker on each side).
    Emph,
    /// Strong emphasis (2-byte marker on each side).
    Strong,
}
59
/// Disposition of a single delimiter byte after emphasis resolution.
///
/// Values are stored in an [`EmphasisPlan`], keyed by source byte position.
#[derive(Debug, Clone, Copy)]
pub enum DelimChar {
    /// Start of an opening marker. The marker spans `len` bytes from this
    /// position; the matching closer starts at `partner` and spans
    /// `partner_len` bytes.
    Open {
        /// Byte length of the opening marker.
        len: u8,
        /// Source byte position at which the matching closing marker starts.
        partner: usize,
        /// Byte length of the matching closing marker.
        partner_len: u8,
        /// Whether the pair forms emphasis or strong emphasis.
        kind: EmphasisKind,
    },
    /// Start of a closing marker. The variant carries no partner data:
    /// emission jumps past close markers via the matching `Open` entry, so
    /// this variant is only consulted defensively.
    Close,
    /// Unmatched delimiter byte; emit as literal text.
    Literal,
}
79
80/// Byte-keyed disposition map for `*` / `_` delimiter chars produced by
81/// the IR's emphasis pass and consumed by the inline emission walk.
82#[derive(Debug, Default, Clone)]
83pub struct EmphasisPlan {
84 by_pos: BTreeMap<usize, DelimChar>,
85}
86
87impl EmphasisPlan {
88 pub fn lookup(&self, pos: usize) -> Option<DelimChar> {
89 self.by_pos.get(&pos).copied()
90 }
91
92 pub fn is_empty(&self) -> bool {
93 self.by_pos.is_empty()
94 }
95
96 /// Construct an `EmphasisPlan` from a byte-keyed disposition map.
97 pub fn from_dispositions(by_pos: BTreeMap<usize, DelimChar>) -> Self {
98 Self { by_pos }
99 }
100}
101
102use super::bracketed_spans::try_parse_bracketed_span;
103use super::citations::{try_parse_bare_citation, try_parse_bracketed_citation};
104use super::code_spans::try_parse_code_span;
105use super::escapes::{EscapeType, try_parse_escape};
106use super::inline_footnotes::{try_parse_footnote_reference, try_parse_inline_footnote};
107use super::inline_html::try_parse_inline_html;
108use super::links::{
109 LinkScanContext, try_parse_autolink, try_parse_inline_image, try_parse_inline_link,
110 try_parse_reference_image, try_parse_reference_link,
111};
112use super::math::{
113 try_parse_display_math, try_parse_double_backslash_display_math,
114 try_parse_double_backslash_inline_math, try_parse_gfm_inline_math, try_parse_inline_math,
115 try_parse_single_backslash_display_math, try_parse_single_backslash_inline_math,
116};
117use super::native_spans::try_parse_native_span;
118
/// One event in the inline IR.
///
/// Events partition the source byte range covered by the IR exactly: their
/// `range()` values are contiguous and non-overlapping, so concatenating
/// them reproduces the original input. This is the losslessness invariant
/// the emission pass relies on.
#[derive(Debug, Clone)]
pub enum IrEvent {
    /// Plain text byte span. Emitted as a single `TEXT` token, possibly
    /// merged with adjacent literal-disposition delim/bracket bytes.
    Text { start: usize, end: usize },

    /// An opaque higher-precedence construct (escape, code span, autolink,
    /// raw HTML, or one of the Pandoc constructs listed on
    /// [`ConstructKind`]). The emission pass re-parses these from the
    /// source byte range using the existing per-construct emitters; we
    /// don't store a pre-built `GreenNode` because
    /// `rowan::GreenNodeBuilder` doesn't support inserting subtrees
    /// directly. The byte range is what makes emission well-defined.
    ///
    /// NOTE(review): earlier wording said the construct kind is recovered
    /// by the emitter dispatching on the leading byte; the `kind` field now
    /// carries a tag as well — confirm which one the emitter relies on.
    Construct {
        start: usize,
        end: usize,
        /// Which construct was recognised at scan time.
        kind: ConstructKind,
    },

    /// A `*` or `_` delimiter run. The `matches` vec is filled in by the
    /// emphasis pass; before that pass it is empty.
    DelimRun {
        /// The delimiter byte, `b'*'` or `b'_'`.
        ch: u8,
        start: usize,
        end: usize,
        /// Whether the run may open emphasis (see `compute_flanking`).
        can_open: bool,
        /// Whether the run may close emphasis (see `compute_flanking`).
        can_close: bool,
        /// Matched fragments produced by the emphasis pass, one
        /// [`DelimMatch`] per fragment. Empty until the pass runs;
        /// possibly multiple entries when a single run matches at
        /// multiple positions (e.g. a 4-run that closes 2+2 pairs).
        matches: Vec<DelimMatch>,
    },

    /// `[` or `![` bracket marker. Resolved by [`process_brackets`].
    OpenBracket {
        start: usize,
        /// `start + 1` for `[`, `start + 2` for `![`.
        end: usize,
        /// `true` for the `![` image opener, `false` for plain `[`.
        is_image: bool,
        /// True until a later resolution rule deactivates this opener.
        active: bool,
        /// Filled in when the matching `CloseBracket` resolves the pair
        /// to a link / image.
        resolution: Option<BracketResolution>,
    },

    /// `]` bracket marker. Resolved by [`process_brackets`].
    CloseBracket {
        /// Source byte position of the `]`; the event covers one byte.
        pos: usize,
        /// True if this `]` was paired with an opener and the pair was
        /// turned into a link / image.
        matched: bool,
    },

    /// A soft line break (a `\n` or `\r\n` ending a paragraph-internal
    /// line). Includes the line-ending bytes verbatim.
    SoftBreak { start: usize, end: usize },

    /// A hard line break: either two or more trailing spaces followed by
    /// a line ending, or a backslash-escaped line ending (`\\\n`).
    /// Includes any trailing-space bytes plus the line ending.
    HardBreak { start: usize, end: usize },
}
189
190impl IrEvent {
191 /// The source byte range this event covers.
192 pub fn range(&self) -> (usize, usize) {
193 match self {
194 IrEvent::Text { start, end }
195 | IrEvent::Construct { start, end, .. }
196 | IrEvent::DelimRun { start, end, .. }
197 | IrEvent::OpenBracket { start, end, .. }
198 | IrEvent::SoftBreak { start, end }
199 | IrEvent::HardBreak { start, end } => (*start, *end),
200 IrEvent::CloseBracket { pos, .. } => (*pos, *pos + 1),
201 }
202 }
203}
204
/// Categorical tag for a [`IrEvent::Construct`] event so emission knows
/// which parser to call to rebuild the CST subtree.
///
/// The first four kinds are CommonMark constructs; the rest are recognised
/// in `build_ir` only under `Dialect::Pandoc`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConstructKind {
    /// `\X` literal-character escape (CommonMark §2.4).
    Escape,
    /// `` `code` `` span (§6.1).
    CodeSpan,
    /// `<scheme://...>` or `<email@host>` (§6.5).
    Autolink,
    /// `<tag ...>` and friends (§6.6).
    InlineHtml,
    /// Pandoc opaque construct that doesn't have a dedicated kind yet
    /// (currently: math spans). Pre-recognised in `build_ir` under
    /// `Dialect::Pandoc` solely so the emphasis pass treats the entire
    /// construct as opaque and delim runs inside don't cross its
    /// boundary. Emission re-parses the construct via the dispatcher's
    /// existing `try_parse_*` chain.
    PandocOpaque,
    /// Pandoc inline footnote `^[note text]`. Recognised in `build_ir`
    /// under `Dialect::Pandoc` and consumed by the emission walk via
    /// the IR's `ConstructPlan`. The dispatcher's legacy `^[` branch
    /// is gated to CommonMark dialect only.
    InlineFootnote,
    /// Pandoc native span `<span ...>...</span>`. Recognised in
    /// `build_ir` under `Dialect::Pandoc` and consumed by the emission
    /// walk via the IR's `ConstructPlan`. The dispatcher's legacy
    /// `<span>` branch is gated to CommonMark dialect only.
    NativeSpan,
    /// Pandoc footnote reference `[^id]`. Recognised in `build_ir`
    /// under `Dialect::Pandoc` and consumed by the emission walk via
    /// the IR's `ConstructPlan`. The dispatcher's legacy `[^id]`
    /// branch is gated to CommonMark dialect only.
    FootnoteReference,
    /// Pandoc bracketed citation `[@key]`, `[see @key, p. 1]`,
    /// `[@a; @b]`. Recognised in `build_ir` under `Dialect::Pandoc`
    /// and consumed by the emission walk via the IR's `ConstructPlan`.
    /// The dispatcher's legacy `[@cite]` branch is gated to CommonMark
    /// dialect only.
    BracketedCitation,
    /// Pandoc bare citation `@key` or `-@key` (author-in-text /
    /// suppress-author). Recognised in `build_ir` under
    /// `Dialect::Pandoc` and consumed by the emission walk via the
    /// IR's `ConstructPlan`. The dispatcher's legacy `@` and `-@`
    /// branches are gated to CommonMark dialect only.
    BareCitation,
    /// Pandoc bracketed span `[content]{attrs}`. Recognised in
    /// `build_ir` under `Dialect::Pandoc` and consumed by the emission
    /// walk via the IR's `ConstructPlan`. The dispatcher's legacy
    /// `[text]{attrs}` branch is gated to CommonMark dialect only.
    BracketedSpan,
}
257
/// One matched fragment within a [`IrEvent::DelimRun`].
///
/// A run can hold several of these when it pairs at more than one
/// position; each entry describes one side of one opener/closer pair and
/// points at the other side through `partner_event` / `partner_offset`.
#[derive(Debug, Clone, Copy)]
pub struct DelimMatch {
    /// Byte offset of this fragment relative to the run's `start`.
    pub offset_in_run: u8,
    /// Number of bytes in this fragment (1 or 2).
    pub len: u8,
    /// Whether this fragment is the opener (`true`) or closer of the pair.
    pub is_opener: bool,
    /// IR event index of the partner run.
    pub partner_event: u32,
    /// Byte offset within the partner run of the partner fragment.
    pub partner_offset: u8,
    /// Emphasis kind (Emph for `len == 1`, Strong for `len == 2`).
    pub kind: EmphasisKind,
}
274
/// Successful bracket resolution: the `[`...`]` pair is a link or image.
///
/// Stored on the [`IrEvent::OpenBracket`] of the resolved pair.
#[derive(Debug, Clone)]
pub struct BracketResolution {
    /// IR event index of the matching `CloseBracket`.
    pub close_event: u32,
    /// Start of the source range of the link text (between `[`/`![` and
    /// `]`).
    pub text_start: usize,
    /// End of the source range of the link text.
    pub text_end: usize,
    /// Start of the source range of the link suffix (`(...)`, `[label]`,
    /// `[]`, or empty for shortcut). When `kind == ShortcutReference`,
    /// `suffix_start == suffix_end == close_pos + 1`.
    pub suffix_start: usize,
    /// End of the source range of the link suffix.
    pub suffix_end: usize,
    /// Which link / image form the pair resolved to.
    pub kind: LinkKind,
}
290
/// What kind of link/image we resolved a bracket pair to.
///
/// The variants follow the four forms listed in the module docs: inline,
/// full reference, collapsed reference, and shortcut reference.
#[derive(Debug, Clone)]
pub enum LinkKind {
    /// `[text](dest)` or `[text](dest "title")`.
    Inline { dest: String, title: Option<String> },
    /// `[text][label]` — explicit reference.
    FullReference { label: String },
    /// `[text][]` — collapsed reference. Label is the link text.
    CollapsedReference,
    /// `[text]` — shortcut reference. Label is the link text.
    ShortcutReference,
}
303
304// ============================================================================
305// Pass 1: Scan
306// ============================================================================
307
308/// Scan `text[start..end]` once, producing a flat IR of events.
309///
310/// The scan is forward-only and never backtracks: each iteration either
311/// consumes a known construct (escape, code span, autolink, raw HTML),
312/// records a delim run / bracket marker / line break, or steps past a
313/// single UTF-8 boundary as plain text. Adjacent text bytes are coalesced
314/// into a single [`IrEvent::Text`] event by the run-flush step.
315pub fn build_ir(text: &str, start: usize, end: usize, config: &ParserOptions) -> Vec<IrEvent> {
316 let mut events = Vec::new();
317 build_ir_into(text, start, end, config, &mut events);
318 events
319}
320
321/// Like [`build_ir`] but writes into a caller-provided `Vec<IrEvent>`,
322/// clearing it first. Used by [`build_full_plans`] to amortise the
323/// per-call allocation through a thread-local scratch pool.
324pub(super) fn build_ir_into(
325 text: &str,
326 start: usize,
327 end: usize,
328 config: &ParserOptions,
329 events: &mut Vec<IrEvent>,
330) {
331 events.clear();
332 let bytes = text.as_bytes();
333 let exts = &config.extensions;
334 let is_commonmark = config.dialect == crate::options::Dialect::CommonMark;
335
336 let mut pos = start;
337 let mut text_run_start = start;
338 // Pandoc-only: extent of the current bracket-shape link/image's
339 // opaque range. While `pos < pandoc_bracket_extent`, autolinks /
340 // raw HTML / native spans are NOT recognised — pandoc-native
341 // treats `[link text]` as opaque to those constructs (CommonMark
342 // spec example #526 / #538). The lookahead at `[`/`![` sets this
343 // when a bracket-shape forms a valid link/image; once `pos`
344 // passes the extent, normal scanning resumes. CommonMark
345 // dialect's link-text-vs-autolink ordering is handled by the
346 // dispatcher's `try_parse_inline_link` rejecting outer matches
347 // when the link text contains a valid autolink (a different
348 // mechanism, see `LinkScanContext.skip_autolinks`).
349 let mut pandoc_bracket_extent: usize = 0;
350
351 // Pre-computed byte mask: `mask[b]` is `true` iff byte `b` could
352 // start any IR-recognised construct under the current dialect /
353 // extensions. Used to bulk-skip plain bytes between structural
354 // bytes — the per-byte branch chain below only runs at positions
355 // where a construct is actually possible. Non-ASCII bytes
356 // (>= 0x80) are never structural and are skipped together with
357 // ASCII plain text.
358 let mask = build_ir_byte_mask(config);
359
360 macro_rules! flush_text {
361 () => {
362 if pos > text_run_start {
363 events.push(IrEvent::Text {
364 start: text_run_start,
365 end: pos,
366 });
367 }
368 };
369 }
370
371 while pos < end {
372 // Fast-skip plain bytes. `text_run_start` is preserved across
373 // the skip so the next structural-event flush picks them up.
374 while pos < end && !mask[bytes[pos] as usize] {
375 pos += 1;
376 }
377 if pos >= end {
378 break;
379 }
380 let b = bytes[pos];
381
382 // Pandoc-only: at `[` or `![`, look ahead to see if this
383 // bracket-shape forms a valid link/image. If so, suppress
384 // autolink / raw HTML / native span recognition until `pos`
385 // passes the bracket-shape's end. Skipped if we're already
386 // inside an enclosing bracket-shape's opaque range.
387 if !is_commonmark
388 && pos >= pandoc_bracket_extent
389 && (b == b'[' || (b == b'!' && pos + 1 < end && bytes[pos + 1] == b'['))
390 && let Some(len) = try_pandoc_bracket_link_extent(text, pos, end, config)
391 {
392 pandoc_bracket_extent = pos + len;
393 }
394 let in_pandoc_bracket = !is_commonmark && pos < pandoc_bracket_extent;
395
396 // Backslash escape (§2.4) — including `\\\n` hard line break.
397 if b == b'\\'
398 && let Some((len, _ch, escape_type)) = try_parse_escape(&text[pos..])
399 && pos + len <= end
400 {
401 let enabled = match escape_type {
402 EscapeType::Literal => is_commonmark || exts.all_symbols_escapable,
403 EscapeType::HardLineBreak => exts.escaped_line_breaks,
404 EscapeType::NonbreakingSpace => exts.all_symbols_escapable,
405 };
406 if enabled {
407 flush_text!();
408 let kind = match escape_type {
409 EscapeType::HardLineBreak => {
410 events.push(IrEvent::HardBreak {
411 start: pos,
412 end: pos + len,
413 });
414 pos += len;
415 text_run_start = pos;
416 continue;
417 }
418 EscapeType::Literal | EscapeType::NonbreakingSpace => ConstructKind::Escape,
419 };
420 events.push(IrEvent::Construct {
421 start: pos,
422 end: pos + len,
423 kind,
424 });
425 pos += len;
426 text_run_start = pos;
427 continue;
428 }
429 }
430
431 // Code span (§6.1) — opaque to emphasis and brackets.
432 if b == b'`'
433 && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
434 && pos + len <= end
435 {
436 flush_text!();
437 events.push(IrEvent::Construct {
438 start: pos,
439 end: pos + len,
440 kind: ConstructKind::CodeSpan,
441 });
442 pos += len;
443 text_run_start = pos;
444 continue;
445 }
446
447 // Pandoc-only: math spans are opaque to emphasis. The legacy
448 // `parse_until_closer_with_nested_*` skip-list includes inline
449 // math; without recognising it here, delim runs inside `$math$`
450 // would be picked up by the emphasis pass and break losslessness
451 // (the dispatcher's math parser would later re-claim the bytes,
452 // duplicating content).
453 if !is_commonmark && let Some(len) = try_pandoc_math_opaque(text, pos, end, config) {
454 flush_text!();
455 events.push(IrEvent::Construct {
456 start: pos,
457 end: pos + len,
458 kind: ConstructKind::PandocOpaque,
459 });
460 pos += len;
461 text_run_start = pos;
462 continue;
463 }
464
465 // Pandoc-only: native span `<span ...>...</span>`. Must come
466 // before the generic autolink/raw-html branches so the open tag
467 // doesn't get claimed as inline HTML. Span content is opaque to
468 // the emphasis pass; emission consumes the event via the IR's
469 // `ConstructPlan`. Suppressed inside Pandoc bracket-shape
470 // link/image text.
471 if !is_commonmark
472 && !in_pandoc_bracket
473 && b == b'<'
474 && exts.native_spans
475 && let Some((len, _, _)) = try_parse_native_span(&text[pos..])
476 && pos + len <= end
477 {
478 flush_text!();
479 events.push(IrEvent::Construct {
480 start: pos,
481 end: pos + len,
482 kind: ConstructKind::NativeSpan,
483 });
484 pos += len;
485 text_run_start = pos;
486 continue;
487 }
488
489 // Autolink (§6.5) before raw HTML — autolinks are the more
490 // specific shape inside `<...>`. Both are suppressed inside
491 // Pandoc bracket-shape link/image text (pandoc-native treats
492 // link text as opaque to autolinks and raw HTML).
493 if b == b'<' && !in_pandoc_bracket {
494 if exts.autolinks
495 && let Some((len, _)) = try_parse_autolink(&text[pos..], is_commonmark)
496 && pos + len <= end
497 {
498 flush_text!();
499 events.push(IrEvent::Construct {
500 start: pos,
501 end: pos + len,
502 kind: ConstructKind::Autolink,
503 });
504 pos += len;
505 text_run_start = pos;
506 continue;
507 }
508 if exts.raw_html
509 && let Some(len) = try_parse_inline_html(&text[pos..])
510 && pos + len <= end
511 {
512 flush_text!();
513 events.push(IrEvent::Construct {
514 start: pos,
515 end: pos + len,
516 kind: ConstructKind::InlineHtml,
517 });
518 pos += len;
519 text_run_start = pos;
520 continue;
521 }
522 }
523
524 // Pandoc-only: inline footnote `^[note]`. Recognized at scan
525 // time so the emphasis pass treats it as opaque (delim runs
526 // inside the footnote can't pair with delim runs outside).
527 if !is_commonmark
528 && b == b'^'
529 && exts.inline_footnotes
530 && let Some((len, _)) = try_parse_inline_footnote(&text[pos..])
531 && pos + len <= end
532 {
533 flush_text!();
534 events.push(IrEvent::Construct {
535 start: pos,
536 end: pos + len,
537 kind: ConstructKind::InlineFootnote,
538 });
539 pos += len;
540 text_run_start = pos;
541 continue;
542 }
543
544 // Pandoc-only: footnote reference `[^id]`. Recognised at scan
545 // time so the emphasis pass treats it as opaque (delim runs
546 // inside the label can't pair with delim runs outside) and the
547 // emission walk dispatches it directly via the IR's
548 // `ConstructPlan`. Must come before the generic bracket-opaque
549 // scan so the dedicated kind wins.
550 if !is_commonmark
551 && b == b'['
552 && pos + 1 < end
553 && bytes[pos + 1] == b'^'
554 && exts.footnotes
555 && let Some((len, _)) = try_parse_footnote_reference(&text[pos..])
556 && pos + len <= end
557 {
558 flush_text!();
559 events.push(IrEvent::Construct {
560 start: pos,
561 end: pos + len,
562 kind: ConstructKind::FootnoteReference,
563 });
564 pos += len;
565 text_run_start = pos;
566 continue;
567 }
568
569 // Pandoc-only: bracketed citation `[@cite]`. Recognised at
570 // scan time so the emphasis pass treats it as opaque (delim
571 // runs inside the citation can't pair with delim runs outside)
572 // and the emission walk dispatches it directly via the IR's
573 // `ConstructPlan`. Must come before the generic bracket-opaque
574 // scan so the dedicated kind wins.
575 if !is_commonmark
576 && b == b'['
577 && exts.citations
578 && let Some((len, _)) = try_parse_bracketed_citation(&text[pos..])
579 && pos + len <= end
580 {
581 flush_text!();
582 events.push(IrEvent::Construct {
583 start: pos,
584 end: pos + len,
585 kind: ConstructKind::BracketedCitation,
586 });
587 pos += len;
588 text_run_start = pos;
589 continue;
590 }
591
592 // Pandoc-only: bare citation `@key` or `-@key`. Recognised at
593 // scan time so the emission walk dispatches it directly via
594 // the IR's `ConstructPlan`. Bare citations don't contain
595 // emphasis-eligible content, so opacity is moot here — IR
596 // participation is only for dispatch consolidation.
597 if !is_commonmark
598 && (b == b'@' || (b == b'-' && pos + 1 < end && bytes[pos + 1] == b'@'))
599 && (exts.citations || exts.quarto_crossrefs)
600 && let Some((len, _, _)) = try_parse_bare_citation(&text[pos..])
601 && pos + len <= end
602 {
603 flush_text!();
604 events.push(IrEvent::Construct {
605 start: pos,
606 end: pos + len,
607 kind: ConstructKind::BareCitation,
608 });
609 pos += len;
610 text_run_start = pos;
611 continue;
612 }
613
614 // Pandoc-only: bracketed span `[content]{attrs}`. Recognised
615 // at scan time so the emphasis pass treats it as opaque (delim
616 // runs inside the span content can't pair with delim runs
617 // outside) and the emission walk dispatches it directly via
618 // the IR's `ConstructPlan`. Must come before the generic
619 // bracket-opaque scan so the dedicated kind wins.
620 // `try_parse_bracketed_span` requires `]` to be immediately
621 // followed by `{`, so this never shadows inline links
622 // (`[text](url)`) or reference links (`[label][refdef]`) —
623 // those don't have the `{attrs}` suffix.
624 if !is_commonmark
625 && b == b'['
626 && exts.bracketed_spans
627 && let Some((len, _, _)) = try_parse_bracketed_span(&text[pos..])
628 && pos + len <= end
629 {
630 flush_text!();
631 events.push(IrEvent::Construct {
632 start: pos,
633 end: pos + len,
634 kind: ConstructKind::BracketedSpan,
635 });
636 pos += len;
637 text_run_start = pos;
638 continue;
639 }
640
641 // `` form, or `reference_links` for the
644 // `![alt][label]` reference-image form (e.g. MultiMarkdown
645 // disables `inline_images` but uses reference images).
646 if b == b'!'
647 && pos + 1 < end
648 && bytes[pos + 1] == b'['
649 && (exts.inline_images || exts.reference_links)
650 {
651 flush_text!();
652 events.push(IrEvent::OpenBracket {
653 start: pos,
654 end: pos + 2,
655 is_image: true,
656 active: true,
657 resolution: None,
658 });
659 pos += 2;
660 text_run_start = pos;
661 continue;
662 }
663
664 // `[` opens a link bracket. Recognised whenever any
665 // link-producing extension is on — `inline_links` for
666 // `[text](url)`, or `reference_links` for `[text][label]` /
667 // `[text]` shortcut form.
668 if b == b'[' && (exts.inline_links || exts.reference_links) {
669 flush_text!();
670 events.push(IrEvent::OpenBracket {
671 start: pos,
672 end: pos + 1,
673 is_image: false,
674 active: true,
675 resolution: None,
676 });
677 pos += 1;
678 text_run_start = pos;
679 continue;
680 }
681
682 // `]` closes a link/image bracket.
683 if b == b']' {
684 flush_text!();
685 events.push(IrEvent::CloseBracket {
686 pos,
687 matched: false,
688 });
689 pos += 1;
690 text_run_start = pos;
691 continue;
692 }
693
694 // `*` or `_` delimiter run.
695 if b == b'*' || b == b'_' {
696 flush_text!();
697 let mut run_end = pos;
698 while run_end < end && bytes[run_end] == b {
699 run_end += 1;
700 }
701 let count = run_end - pos;
702 let (can_open, can_close) = compute_flanking(text, pos, count, b, config.dialect);
703 events.push(IrEvent::DelimRun {
704 ch: b,
705 start: pos,
706 end: run_end,
707 can_open,
708 can_close,
709 matches: Vec::new(),
710 });
711 pos = run_end;
712 text_run_start = pos;
713 continue;
714 }
715
716 // Hard line break: 2+ trailing spaces before newline. We detect
717 // this when we're sitting on a `\n` (or `\r\n`) and the preceding
718 // bytes within the current text run are spaces.
719 if b == b'\n' || (b == b'\r' && pos + 1 < end && bytes[pos + 1] == b'\n') {
720 // Count trailing spaces in the text accumulated so far.
721 let nl_len = if b == b'\r' { 2 } else { 1 };
722 let mut trailing_spaces = 0;
723 let mut s = pos;
724 while s > text_run_start && bytes[s - 1] == b' ' {
725 trailing_spaces += 1;
726 s -= 1;
727 }
728 if trailing_spaces >= 2 {
729 // Flush text *before* the trailing spaces.
730 if s > text_run_start {
731 events.push(IrEvent::Text {
732 start: text_run_start,
733 end: s,
734 });
735 }
736 events.push(IrEvent::HardBreak {
737 start: s,
738 end: pos + nl_len,
739 });
740 pos += nl_len;
741 text_run_start = pos;
742 continue;
743 }
744
745 // Soft line break: flush preceding text, emit the line ending
746 // as its own event so the emitter can render `NEWLINE` tokens
747 // verbatim.
748 flush_text!();
749 events.push(IrEvent::SoftBreak {
750 start: pos,
751 end: pos + nl_len,
752 });
753 pos += nl_len;
754 text_run_start = pos;
755 continue;
756 }
757
758 // Plain byte — advance one UTF-8 char.
759 let ch_len = text[pos..]
760 .chars()
761 .next()
762 .map_or(1, std::primitive::char::len_utf8);
763 pos += ch_len.max(1);
764 }
765
766 flush_text!();
767}
768
769/// Build a 256-entry mask: `mask[b]` is `true` iff byte `b` could start
770/// any IR-recognised construct under the current dialect / extensions.
771///
772/// This is the build-IR-specific superset of "is this byte interesting".
773/// Plain bytes between structural bytes are bulk-skipped via this mask
774/// in the [`build_ir`] hot loop; missing a byte here is a correctness
775/// bug (we'd skip past a real construct), but having extras only costs
776/// us a wasted branch round-trip.
777fn build_ir_byte_mask(config: &ParserOptions) -> [bool; 256] {
778 let mut mask = [false; 256];
779 let exts = &config.extensions;
780 let is_commonmark = config.dialect == crate::options::Dialect::CommonMark;
781
782 // Always structural for IR scanning:
783 // `\n` / `\r` — soft / hard breaks
784 // `\\` — escape, hard line break, backslash math
785 // `` ` `` — code span (IR construct)
786 // `*` / `_` — emphasis delim runs (IR core)
787 mask[b'\n' as usize] = true;
788 mask[b'\r' as usize] = true;
789 mask[b'\\' as usize] = true;
790 mask[b'`' as usize] = true;
791 mask[b'*' as usize] = true;
792 mask[b'_' as usize] = true;
793
794 // Brackets: scanned whenever any bracket-shaped construct is
795 // reachable. `]` is structural unconditionally if `[` is — the IR
796 // emits a CloseBracket event regardless of which opener variant
797 // matches. `!` is gated on image-producing extensions; the leading
798 // `!` of `![alt]` is the only image entry point.
799 if exts.inline_links
800 || exts.reference_links
801 || exts.inline_images
802 || exts.bracketed_spans
803 || exts.footnotes
804 || exts.citations
805 {
806 mask[b'[' as usize] = true;
807 mask[b']' as usize] = true;
808 }
809 if exts.inline_images || exts.reference_links {
810 mask[b'!' as usize] = true;
811 }
812
813 // `<` covers autolinks, raw HTML, and Pandoc native spans.
814 if exts.autolinks || exts.raw_html || (!is_commonmark && exts.native_spans) {
815 mask[b'<' as usize] = true;
816 }
817
818 // `^` covers Pandoc inline footnotes (`^[...]` recognised in IR
819 // under Pandoc dialect). CM dialect inline footnotes go through
820 // the dispatcher, not the IR.
821 if !is_commonmark && exts.inline_footnotes {
822 mask[b'^' as usize] = true;
823 }
824
825 // `@` covers Pandoc bare citation `@key` and `[@cite]`. The leading
826 // `[` of `[@cite]` is already in the mask via the bracket gate;
827 // gating `@` here also covers the bare-citation form.
828 if !is_commonmark && (exts.citations || exts.quarto_crossrefs) {
829 mask[b'@' as usize] = true;
830 // `-` only matters as the first byte of `-@cite`. Tracking it
831 // here avoids missing the suppress-author bare citation form.
832 mask[b'-' as usize] = true;
833 }
834
835 // `$` covers Pandoc dollar / GFM math. CM doesn't recognise math
836 // in `build_ir`.
837 if !is_commonmark
838 && (exts.tex_math_dollars
839 || exts.tex_math_gfm
840 || exts.tex_math_single_backslash
841 || exts.tex_math_double_backslash)
842 {
843 mask[b'$' as usize] = true;
844 }
845
846 mask
847}
848
849// ============================================================================
850// Flanking (CommonMark §6.2)
851// ============================================================================
852
853fn compute_flanking(
854 text: &str,
855 pos: usize,
856 count: usize,
857 ch: u8,
858 dialect: crate::options::Dialect,
859) -> (bool, bool) {
860 if dialect == crate::options::Dialect::Pandoc {
861 // Pandoc-markdown's recursive-descent emphasis parser does NOT
862 // apply CommonMark §6.2 flanking rules. Instead it gates on:
863 // - opener: must not be followed by whitespace (Pandoc
864 // `try_parse_emphasis` line 247 in legacy core.rs).
865 // - closer: no flanking gate at all (Pandoc-markdown's
866 // `ender` parser only counts characters; see Markdown.hs
867 // in pandoc/src/Text/Pandoc/Readers/Markdown.hs).
868 // - underscore intraword hard rule: `_` adjacent to an
869 // alphanumeric on either side cannot open / close
870 // (Pandoc's `intraword_underscores` extension default).
871 let prev_char = (pos > 0).then(|| text[..pos].chars().last()).flatten();
872 let next_char = text.get(pos + count..).and_then(|s| s.chars().next());
873 let followed_by_ws = next_char.is_none_or(|c| c.is_whitespace());
874
875 let mut can_open = !followed_by_ws;
876 // Pandoc-markdown's `ender` (in pandoc/Readers/Markdown.hs)
877 // has no flanking restriction on closers — just a count match.
878 // Set can_close unconditionally; the per-pair match logic in
879 // `process_emphasis_in_range_filtered` constrains pairing via
880 // the equal-count rule.
881 let mut can_close = true;
882
883 if ch == b'_' {
884 let prev_is_alnum = prev_char.is_some_and(|c| c.is_alphanumeric());
885 let next_is_alnum = next_char.is_some_and(|c| c.is_alphanumeric());
886 if prev_is_alnum {
887 can_open = false;
888 }
889 if next_is_alnum {
890 can_close = false;
891 }
892 }
893
894 return (can_open, can_close);
895 }
896
897 // CommonMark §6.2 flanking.
898 let lf = is_left_flanking(text, pos, count);
899 let rf = is_right_flanking(text, pos, count);
900 if ch == b'*' {
901 (lf, rf)
902 } else {
903 let prev_char = (pos > 0).then(|| text[..pos].chars().last()).flatten();
904 let next_char = text.get(pos + count..).and_then(|s| s.chars().next());
905 let preceded_by_punct = prev_char.is_some_and(is_unicode_punct_or_symbol);
906 let followed_by_punct = next_char.is_some_and(is_unicode_punct_or_symbol);
907 let can_open = lf && (!rf || preceded_by_punct);
908 let can_close = rf && (!lf || followed_by_punct);
909 (can_open, can_close)
910 }
911}
912
913/// Pandoc-only: identify a math span starting at `pos` and return its
914/// byte length. Tries `$math$` and `$$display$$` (gated on
915/// `tex_math_dollars`), GFM `$math$` (gated on `tex_math_gfm`), and the
916/// `\(math\)` / `\[math\]` / `\\(math\\)` / `\\[math\\]` backslash
917/// forms (gated on `tex_math_single_backslash` / `_double_backslash`).
918/// Math content is opaque to emphasis: `$a * b$` must not produce an
919/// emphasis closer at the inner `*`.
920fn try_pandoc_math_opaque(
921 text: &str,
922 pos: usize,
923 end: usize,
924 config: &ParserOptions,
925) -> Option<usize> {
926 let bytes = text.as_bytes();
927 let exts = &config.extensions;
928 let b = bytes[pos];
929
930 if exts.tex_math_dollars && b == b'$' {
931 if let Some((len, _)) = try_parse_display_math(&text[pos..])
932 && pos + len <= end
933 {
934 return Some(len);
935 }
936 if let Some((len, _)) = try_parse_inline_math(&text[pos..])
937 && pos + len <= end
938 {
939 return Some(len);
940 }
941 }
942 if exts.tex_math_gfm
943 && b == b'$'
944 && let Some((len, _)) = try_parse_gfm_inline_math(&text[pos..])
945 && pos + len <= end
946 {
947 return Some(len);
948 }
949 if exts.tex_math_double_backslash && b == b'\\' {
950 if let Some((len, _)) = try_parse_double_backslash_display_math(&text[pos..])
951 && pos + len <= end
952 {
953 return Some(len);
954 }
955 if let Some((len, _)) = try_parse_double_backslash_inline_math(&text[pos..])
956 && pos + len <= end
957 {
958 return Some(len);
959 }
960 }
961 if exts.tex_math_single_backslash && b == b'\\' {
962 if let Some((len, _)) = try_parse_single_backslash_display_math(&text[pos..])
963 && pos + len <= end
964 {
965 return Some(len);
966 }
967 if let Some((len, _)) = try_parse_single_backslash_inline_math(&text[pos..])
968 && pos + len <= end
969 {
970 return Some(len);
971 }
972 }
973 None
974}
975
976/// Pandoc-only: identify a bracket-shaped opaque construct starting at
977/// `pos` and return its byte length. Tries the dispatcher's precedence
978/// order:
979/// 1. `` inline image
980/// 2. `![alt][ref]` / `![alt]` reference image (shape-only opacity)
981/// 3. `[^id]` footnote reference
982/// 4. `[text](dest)` inline link
983/// 5. `[text][ref]` / `[text]` reference link (shape-only opacity)
984/// 6. `[@cite]` bracketed citation
985/// 7. `[text]{attrs}` bracketed span
986///
987/// Returns `None` if the bytes at `pos` don't open any recognised Pandoc
988/// bracket-shaped construct. In that case the scanner falls through to
989/// the generic `OpenBracket`/`CloseBracket` emission and the dispatcher
990/// emits the bracket bytes as literal text (or as plain emphasis if the
991/// pattern matches an opener).
992/// Lookahead helper: at a `[` or `![` byte under Pandoc dialect, return
993/// the total byte length of the bracket-shape link/image if it forms a
994/// valid one, else `None`. Used by `build_ir` to suppress autolink /
995/// raw HTML / native span recognition inside Pandoc link text —
996/// pandoc-native treats link text as opaque to those constructs
997/// (CommonMark spec example #526 / #538 differs). Mirrors the
998/// dispatcher's `try_parse_*` precedence so the lookahead, the IR's
999/// `process_brackets` resolution, and the dispatcher's emission agree
1000/// on the bracket-shape's byte boundaries.
1001fn try_pandoc_bracket_link_extent(
1002 text: &str,
1003 pos: usize,
1004 end: usize,
1005 config: &ParserOptions,
1006) -> Option<usize> {
1007 let bytes = text.as_bytes();
1008 let exts = &config.extensions;
1009 let ctx = LinkScanContext::from_options(config);
1010 let allow_shortcut = exts.shortcut_reference_links;
1011
1012 // `![...]` images.
1013 if bytes[pos] == b'!' {
1014 if pos + 1 >= end || bytes[pos + 1] != b'[' {
1015 return None;
1016 }
1017 if exts.inline_images
1018 && let Some((len, _, _, _)) = try_parse_inline_image(&text[pos..], ctx)
1019 && pos + len <= end
1020 {
1021 return Some(len);
1022 }
1023 if exts.reference_links
1024 && let Some((len, _, _, _)) = try_parse_reference_image(&text[pos..], allow_shortcut)
1025 && pos + len <= end
1026 {
1027 return Some(len);
1028 }
1029 return None;
1030 }
1031
1032 // `[...]` openers — try in dispatcher order. Footnote refs
1033 // (`[^id]`), bracketed citations (`[@cite]`), and bracketed spans
1034 // (`[text]{attrs}`) are recognised by their own dedicated branches
1035 // in `build_ir` and don't need this lookahead.
1036 if exts.inline_links
1037 && let Some((len, _, _, _)) = try_parse_inline_link(&text[pos..], false, ctx)
1038 && pos + len <= end
1039 {
1040 return Some(len);
1041 }
1042 if exts.reference_links
1043 && let Some((len, _, _, _)) =
1044 try_parse_reference_link(&text[pos..], allow_shortcut, exts.inline_links, ctx)
1045 && pos + len <= end
1046 {
1047 return Some(len);
1048 }
1049
1050 None
1051}
1052
/// Approximate the CommonMark "Unicode punctuation character" test
/// without Unicode category tables.
///
/// ASCII characters use `char::is_ascii_punctuation`; any non-ASCII
/// character counts as punctuation/symbol when it is neither
/// alphanumeric nor whitespace.
fn is_unicode_punct_or_symbol(c: char) -> bool {
    match c {
        ascii if ascii.is_ascii() => ascii.is_ascii_punctuation(),
        other => !(other.is_alphanumeric() || other.is_whitespace()),
    }
}
1060
1061fn is_left_flanking(text: &str, run_start: usize, run_len: usize) -> bool {
1062 let after = run_start + run_len;
1063 let next_char = text.get(after..).and_then(|s| s.chars().next());
1064 let prev_char = (run_start > 0)
1065 .then(|| text[..run_start].chars().last())
1066 .flatten();
1067
1068 let followed_by_ws = next_char.is_none_or(|c| c.is_whitespace());
1069 if followed_by_ws {
1070 return false;
1071 }
1072 let followed_by_punct = next_char.is_some_and(is_unicode_punct_or_symbol);
1073 if !followed_by_punct {
1074 return true;
1075 }
1076 prev_char.is_none_or(|c| c.is_whitespace() || is_unicode_punct_or_symbol(c))
1077}
1078
1079fn is_right_flanking(text: &str, run_start: usize, run_len: usize) -> bool {
1080 let after = run_start + run_len;
1081 let next_char = text.get(after..).and_then(|s| s.chars().next());
1082 let prev_char = (run_start > 0)
1083 .then(|| text[..run_start].chars().last())
1084 .flatten();
1085
1086 let preceded_by_ws = prev_char.is_none_or(|c| c.is_whitespace());
1087 if preceded_by_ws {
1088 return false;
1089 }
1090 let preceded_by_punct = prev_char.is_some_and(is_unicode_punct_or_symbol);
1091 if !preceded_by_punct {
1092 return true;
1093 }
1094 next_char.is_none_or(|c| c.is_whitespace() || is_unicode_punct_or_symbol(c))
1095}
1096
1097// ============================================================================
// Pass 3: Process emphasis (CommonMark §6.2)
1099// ============================================================================
1100
1101/// Run the CommonMark §6.3 `process_emphasis` algorithm over the IR's
1102/// delim runs. Mutates the IR in place: matched runs gain entries in their
1103/// `matches` vec, unmatched bytes stay implicit (the emission pass treats
1104/// any byte not covered by a match as literal text).
1105///
1106/// The algorithm tracks a per-bucket `openers_bottom` exclusive lower
1107/// bound to keep walk-back bounded; consume rules and the §6.2 mod-3
1108/// rejection match the reference implementation.
1109pub fn process_emphasis(events: &mut [IrEvent], dialect: crate::options::Dialect) {
1110 process_emphasis_in_range(events, 0, events.len(), dialect);
1111}
1112
1113/// Range-scoped variant of [`process_emphasis`].
1114///
1115/// Only delim runs whose IR event index lies in `[lo, hi)` are considered.
1116/// Used by [`build_full_plans`] to run emphasis pairing inside each
1117/// resolved bracket pair *before* the global top-level pass, so emphasis
1118/// can never form across a link's bracket boundary (CommonMark §6.3
1119/// requires bracket resolution to happen first when at a `]`, with
1120/// emphasis processed on the link's inner range).
1121///
1122/// The function additionally skips delim runs that already carry a
1123/// recorded match in their `matches` vec — this lets the second
1124/// (top-level) pass reuse the same algorithm without re-pairing bytes
1125/// already consumed by inner-range passes.
1126pub fn process_emphasis_in_range(
1127 events: &mut [IrEvent],
1128 lo: usize,
1129 hi: usize,
1130 dialect: crate::options::Dialect,
1131) {
1132 process_emphasis_in_range_filtered(events, lo, hi, None, dialect);
1133}
1134
1135/// Internal variant of [`process_emphasis_in_range`] with an optional
1136/// exclusion bitmap. Event indices for which `excluded[i] == true` are
1137/// treated as if their delim run were already fully consumed — used by
1138/// [`build_full_plans`] to keep the top-level emphasis pass from pairing
1139/// across a resolved bracket pair's boundary (the inner delim runs of
1140/// such a pair belong to the link's inner range and were already paired
1141/// by the scoped pass).
1142fn process_emphasis_in_range_filtered(
1143 events: &mut [IrEvent],
1144 lo: usize,
1145 hi: usize,
1146 excluded: Option<&[bool]>,
1147 dialect: crate::options::Dialect,
1148) {
1149 let is_commonmark = dialect == crate::options::Dialect::CommonMark;
1150 if is_commonmark {
1151 run_emphasis_pass(events, lo, hi, excluded, dialect, &[], false);
1152 return;
1153 }
1154 // Pandoc dialect: cascade-then-rerun. Run the standard pass, then
1155 // invalidate Emph/Strong pairs whose inner range contains an
1156 // unmatched same-char run with both can_open && can_close (Pandoc's
1157 // recursive descent would have failed those outer pairs because the
1158 // inner content has a stray, ambiguous delimiter the recursive
1159 // parser cannot pair). The invalidated pairs go into a "rejected
1160 // list" that the next iteration of the standard pass consults to
1161 // pick a different opener for the same closer (or reject the
1162 // closer altogether). Iterate to a fixed point.
1163 //
1164 // The rerun (iter 2+) runs in `strict` mode: a candidate pair is
1165 // rejected if its inner range contains an unmatched same-char run
1166 // with count > pair.count. This mirrors pandoc-markdown's
1167 // recursive-descent semantics where, e.g. inside a failed outer
1168 // `**...**` Strong, the inner `one c` parser's `option2`
1169 // (`string [c,c] >> two c mempty`) greedily consumes a stray `**`
1170 // and prevents subsequent `*` runs from pairing as Emph. Without
1171 // this gate, `**foo *bar** baz*` would produce Emph[bar** baz]
1172 // after the outer Strong invalidation, but pandoc treats it as
1173 // all-literal because the inner `**` blocks the Emph match.
1174 let mut rejected: Vec<(usize, usize)> = Vec::new();
1175 let max_iters = events.len().saturating_add(2);
1176 let mut iter = 0;
1177 loop {
1178 let strict = iter > 0;
1179 run_emphasis_pass(events, lo, hi, excluded, dialect, &rejected, strict);
1180 let invalidations = pandoc_cascade_invalidate(events);
1181 if invalidations.is_empty() {
1182 break;
1183 }
1184 rejected.extend(invalidations);
1185 iter += 1;
1186 if iter >= max_iters {
1187 break;
1188 }
1189 }
1190 // Recovery for `***A **B** C***` patterns: synthesise the inner
1191 // Strong match the standard delim-stack algorithm can't reach.
1192 pandoc_inner_strong_recovery(events);
1193}
1194
1195/// One pass of the CommonMark §6.2 emphasis pairing algorithm over the
1196/// IR's [`DelimRun`](IrEvent::DelimRun) events in `[lo, hi)`. Pandoc
1197/// dialect gates apply when `dialect == Dialect::Pandoc`. The
1198/// `rejected_pairs` list (Pandoc only) excludes specific
1199/// (opener_event_idx, closer_event_idx) pairs from matching — used by
1200/// the cascade-then-rerun loop to prevent invalidated pairs from
1201/// re-forming on the next iteration.
1202fn run_emphasis_pass(
1203 events: &mut [IrEvent],
1204 lo: usize,
1205 hi: usize,
1206 excluded: Option<&[bool]>,
1207 dialect: crate::options::Dialect,
1208 rejected_pairs: &[(usize, usize)],
1209 strict_pandoc: bool,
1210) {
1211 let is_commonmark = dialect == crate::options::Dialect::CommonMark;
1212 let hi = hi.min(events.len());
1213 if lo >= hi {
1214 return;
1215 }
1216 // Indices of DelimRun events within [lo, hi), in order, that have
1217 // not already been fully consumed by an earlier scoped pass and that
1218 // are not in the optional exclusion bitmap.
1219 let mut delim_idxs: Vec<usize> = events[lo..hi]
1220 .iter()
1221 .enumerate()
1222 .filter_map(|(i, e)| {
1223 let abs = lo + i;
1224 match e {
1225 IrEvent::DelimRun { matches, .. }
1226 if matches.is_empty()
1227 && excluded.is_none_or(|ex| ex.get(abs).copied() != Some(true)) =>
1228 {
1229 Some(abs)
1230 }
1231 _ => None,
1232 }
1233 })
1234 .collect();
1235 if delim_idxs.is_empty() {
1236 return;
1237 }
1238
1239 // Working state: count (remaining unmatched chars) and source_start
1240 // (first remaining char) per delim run. Indexed by position in
1241 // `delim_idxs`.
1242 let mut count: Vec<usize> = Vec::with_capacity(delim_idxs.len());
1243 let mut source_start: Vec<usize> = Vec::with_capacity(delim_idxs.len());
1244 let mut removed: Vec<bool> = vec![false; delim_idxs.len()];
1245
1246 for &ev_idx in &delim_idxs {
1247 if let IrEvent::DelimRun { start, end, .. } = &events[ev_idx] {
1248 count.push(end - start);
1249 source_start.push(*start);
1250 }
1251 }
1252
1253 // openers_bottom[ch_idx][len%3][can_open] → exclusive lower bound
1254 // (an index into `delim_idxs`, or None meaning "no bottom yet").
1255 let mut openers_bottom: [[[Option<usize>; 2]; 3]; 2] = [[[None; 2]; 3]; 2];
1256
1257 // First active index, scanning forward.
1258 let first_active =
1259 |removed: &[bool]| -> Option<usize> { (0..removed.len()).find(|&i| !removed[i]) };
1260 let next_active = |removed: &[bool], from: usize| -> Option<usize> {
1261 (from + 1..removed.len()).find(|&i| !removed[i])
1262 };
1263 let prev_active =
1264 |removed: &[bool], from: usize| -> Option<usize> { (0..from).rev().find(|&i| !removed[i]) };
1265
1266 let min_closer_count = 1usize;
1267 let mut closer_local = first_active(&removed);
1268 while let Some(c) = closer_local {
1269 let ev_c_idx = delim_idxs[c];
1270 let (ch_c, can_open_c, can_close_c) = match &events[ev_c_idx] {
1271 IrEvent::DelimRun {
1272 ch,
1273 can_open,
1274 can_close,
1275 ..
1276 } => (*ch, *can_open, *can_close),
1277 _ => unreachable!(),
1278 };
1279 if !can_close_c || removed[c] || count[c] < min_closer_count {
1280 closer_local = next_active(&removed, c);
1281 continue;
1282 }
1283
1284 let ch_idx = if ch_c == b'*' { 0 } else { 1 };
1285 let closer_mod = count[c] % 3;
1286 let closer_open_bucket = can_open_c as usize;
1287 let bottom = openers_bottom[ch_idx][closer_mod][closer_open_bucket];
1288
1289 // Walk back to find a compatible opener.
1290 let mut found_opener: Option<usize> = None;
1291 let mut walk = prev_active(&removed, c);
1292 while let Some(o) = walk {
1293 if Some(o) == bottom {
1294 break;
1295 }
1296 let ev_o_idx = delim_idxs[o];
1297 let (ch_o, can_open_o, can_close_o) = match &events[ev_o_idx] {
1298 IrEvent::DelimRun {
1299 ch,
1300 can_open,
1301 can_close,
1302 ..
1303 } => (*ch, *can_open, *can_close),
1304 _ => unreachable!(),
1305 };
1306 if !removed[o] && ch_o == ch_c && can_open_o {
1307 let oc_sum = count[o] + count[c];
1308 let opener_both = can_open_o && can_close_o;
1309 let closer_both = can_open_c && can_close_c;
1310 let mod3_reject = is_commonmark
1311 && (opener_both || closer_both)
1312 && oc_sum.is_multiple_of(3)
1313 && !(count[o].is_multiple_of(3) && count[c].is_multiple_of(3));
1314 // Pandoc-markdown rejects emph/strong pairs whose counts
1315 // disagree in the exactly-(1,2) / (2,1) shape:
1316 // - `**foo*` (2,1): `try_parse_two` looks only for a
1317 // `**` closer; the lone `*` doesn't satisfy that.
1318 // - `*foo**` (1,2): `try_parse_one` encountering `**`
1319 // tries `try_parse_two`; absence of an inner `**`
1320 // closer cascades the outer parse to fail.
1321 // Other count combinations DO match (verified against
1322 // `pandoc -f markdown`):
1323 // - (1,3) / (3,1) → emph match, opposite-side
1324 // leftover `**` literal.
1325 // - (2,3) / (3,2) → strong match, single `*` literal.
1326 // - (3,3) → STRONG(EM(...)) nested.
1327 // - (1..3, 4+) → match (Pandoc's ender walks the
1328 // closer run for a valid position; algorithm
1329 // consumes leftmost via leftover-as-literal).
1330 // Opener count >= 4 is rejected (Pandoc's
1331 // `try_parse_emphasis` has no count-4+ dispatch).
1332 let pandoc_reject = !is_commonmark
1333 && ((count[o] == 1 && count[c] == 2)
1334 || (count[o] == 2 && count[c] == 1)
1335 || count[o] >= 4);
1336 let pair_rejected = !is_commonmark && {
1337 let oe = delim_idxs[o];
1338 let ce = delim_idxs[c];
1339 rejected_pairs.iter().any(|&(ro, rc)| ro == oe && rc == ce)
1340 };
1341 // Pandoc strict-rerun gate (iter 2+ only): block a
1342 // candidate pair if any unmatched same-char run between
1343 // its opener and closer has remaining count strictly
1344 // greater than the consume rule for this pair.
1345 // Mirrors pandoc-markdown's recursive descent where
1346 // `one c`'s `option2` (`string [c,c] >> two c`) would
1347 // greedily consume a stray higher-count run, blocking
1348 // the outer `one c` from finding its `ender c 1` —
1349 // e.g. `**foo *bar** baz*` after the outer Strong
1350 // invalidates: a naïve rerun pairs ev1 (`*`) ↔ ev3
1351 // (`*`) as Emph (consume=1), but pandoc treats the
1352 // `**` between as having "consumed" any further
1353 // matching, leaving everything literal.
1354 let strict_block = strict_pandoc && {
1355 let tentative_consume = if !is_commonmark && count[o] >= 3 && count[c] >= 3 {
1356 1
1357 } else if count[o] >= 2 && count[c] >= 2 {
1358 2
1359 } else {
1360 1
1361 };
1362 let lo_evt = delim_idxs[o] + 1;
1363 let hi_evt = delim_idxs[c];
1364 (lo_evt..hi_evt).any(|k| match &events[k] {
1365 IrEvent::DelimRun {
1366 ch: ch_k,
1367 start,
1368 end,
1369 matches,
1370 ..
1371 } => {
1372 *ch_k == ch_c && {
1373 let total = end - start;
1374 let consumed: usize = matches.iter().map(|m| m.len as usize).sum();
1375 total.saturating_sub(consumed) > tentative_consume
1376 }
1377 }
1378 _ => false,
1379 })
1380 };
1381 if !mod3_reject && !pandoc_reject && !pair_rejected && !strict_block {
1382 found_opener = Some(o);
1383 break;
1384 }
1385 }
1386 if o == 0 {
1387 break;
1388 }
1389 walk = prev_active(&removed, o);
1390 }
1391
1392 if let Some(o) = found_opener {
1393 // Consume rule:
1394 // CommonMark — consume 2 (Strong) when both sides have
1395 // >= 2 chars, else 1 (Emph). For `***x***` (3,3) this
1396 // produces EM(STRONG(...)) because the first match
1397 // consumes 2 from each side (Strong outermost).
1398 // Pandoc — when both sides have >= 3, consume 1 first
1399 // (Emph innermost) leaving 2 + 2 to pair as Strong on
1400 // the second pass. This produces STRONG(EM(...)) for
1401 // `***x***`, matching Pandoc-markdown's recursive
1402 // `try_parse_three` algorithm.
1403 let consume = if !is_commonmark && count[o] >= 3 && count[c] >= 3 {
1404 1
1405 } else if count[o] >= 2 && count[c] >= 2 {
1406 2
1407 } else {
1408 1
1409 };
1410 let kind = if consume == 2 {
1411 EmphasisKind::Strong
1412 } else {
1413 EmphasisKind::Emph
1414 };
1415
1416 // Opener consumes inner-edge (rightmost) chars.
1417 let opener_match_offset =
1418 source_start[o] + count[o] - consume - source_start_event(&events[delim_idxs[o]]);
1419 // Closer consumes inner-edge (leftmost) chars.
1420 let closer_match_offset = source_start[c] - source_start_event(&events[delim_idxs[c]]);
1421
1422 // Record match on opener.
1423 if let IrEvent::DelimRun { matches, .. } = &mut events[delim_idxs[o]] {
1424 matches.push(DelimMatch {
1425 offset_in_run: opener_match_offset as u8,
1426 len: consume as u8,
1427 is_opener: true,
1428 partner_event: delim_idxs[c] as u32,
1429 partner_offset: closer_match_offset as u8,
1430 kind,
1431 });
1432 }
1433 // Record match on closer.
1434 if let IrEvent::DelimRun { matches, .. } = &mut events[delim_idxs[c]] {
1435 matches.push(DelimMatch {
1436 offset_in_run: closer_match_offset as u8,
1437 len: consume as u8,
1438 is_opener: false,
1439 partner_event: delim_idxs[o] as u32,
1440 partner_offset: opener_match_offset as u8,
1441 kind,
1442 });
1443 }
1444
1445 count[o] -= consume;
1446 source_start[c] += consume;
1447 count[c] -= consume;
1448
1449 // Remove all openers strictly between o and c.
1450 let mut between = next_active(&removed, o);
1451 while let Some(idx) = between {
1452 if idx == c {
1453 break;
1454 }
1455 removed[idx] = true;
1456 between = next_active(&removed, idx);
1457 }
1458
1459 if count[o] == 0 {
1460 removed[o] = true;
1461 }
1462 if count[c] == 0 {
1463 removed[c] = true;
1464 closer_local = next_active(&removed, c);
1465 }
1466 // Else re-process the same closer with reduced count.
1467 } else {
1468 openers_bottom[ch_idx][closer_mod][closer_open_bucket] = prev_active(&removed, c);
1469 if !can_open_c {
1470 removed[c] = true;
1471 }
1472 closer_local = next_active(&removed, c);
1473 }
1474 }
1475
1476 // No further mutation needed: matches are recorded; remaining bytes
1477 // stay implicit literal. Pandoc cascade is invoked by the caller
1478 // (`process_emphasis_in_range_filtered`) once per pass so it can
1479 // accumulate invalidations into a rejected-pairs list and re-run.
1480 let _ = (&mut delim_idxs, &mut openers_bottom, min_closer_count);
1481}
1482
1483/// Pandoc-only post-processing pass over [`process_emphasis_in_range_filtered`]
1484/// matches: invalidate any matched delim pair that contains an unmatched
1485/// same-character run between its opener and closer. Returns the list
1486/// of (opener_event_idx, closer_event_idx) pairs that were invalidated
1487/// in this call, so the caller can seed a rejected-pairs list and
1488/// re-run the standard pass — this lets Pandoc re-pair the inner runs
1489/// that the invalidated outer match would have stolen via
1490/// between-removal (e.g. `*foo **bar* baz**` → after the outer
1491/// `ev0..ev2` Emph is invalidated, `ev1..ev3` matches as Strong on the
1492/// next iteration).
1493fn pandoc_cascade_invalidate(events: &mut [IrEvent]) -> Vec<(usize, usize)> {
1494 let mut invalidated_pairs: Vec<(usize, usize)> = Vec::new();
1495 // Early-exit: if there are no `DelimRun` events at all, the cascade
1496 // pass is a no-op. Avoids allocating the two scratch vecs below for
1497 // every range with no `*`/`_` runs (which is the common case for
1498 // ranges that contain only standalone constructs / brackets).
1499 if !events.iter().any(|e| matches!(e, IrEvent::DelimRun { .. })) {
1500 return invalidated_pairs;
1501 }
1502 // Reuse two scratch vecs across the inner loop iterations instead
1503 // of `.collect()` each time. These are tiny per-paragraph
1504 // allocations but the function is called for every Pandoc inline
1505 // emphasis pass and shows up in malloc traffic.
1506 let mut total: Vec<usize> = Vec::with_capacity(events.len());
1507 let mut consumed: Vec<usize> = Vec::with_capacity(events.len());
1508 loop {
1509 total.clear();
1510 consumed.clear();
1511 // Compute total bytes (run length) and consumed bytes (sum of
1512 // match lens) per DelimRun event index.
1513 total.extend(events.iter().map(|e| match e {
1514 IrEvent::DelimRun { start, end, .. } => end - start,
1515 _ => 0,
1516 }));
1517 consumed.extend(events.iter().map(|e| match e {
1518 IrEvent::DelimRun { matches, .. } => matches.iter().map(|m| m.len as usize).sum(),
1519 _ => 0,
1520 }));
1521
1522 // Find a pair to invalidate. We invalidate one and restart so
1523 // the cascade can re-evaluate dependent pairs.
1524 let mut to_invalidate: Option<(usize, u8)> = None;
1525 'outer: for opener_idx in 0..events.len() {
1526 let IrEvent::DelimRun {
1527 ch: ch_o, matches, ..
1528 } = &events[opener_idx]
1529 else {
1530 continue;
1531 };
1532 for (mi, m) in matches.iter().enumerate() {
1533 if !m.is_opener {
1534 continue;
1535 }
1536 let closer_idx = m.partner_event as usize;
1537 if closer_idx <= opener_idx || closer_idx >= events.len() {
1538 continue;
1539 }
1540 // Scan events strictly between opener and closer for any
1541 // DelimRun with the same `ch`, unmatched bytes, AND
1542 // both `can_open` and `can_close` (i.e., the run could
1543 // have participated in pairing on both sides). A
1544 // can_open-only or can_close-only run is a one-sided
1545 // fragment (e.g. an isolated `*` after a backslash
1546 // escape) that the Pandoc recursive-descent path would
1547 // never have tried as a nested-strong opener — those
1548 // shouldn't cascade-invalidate the surrounding pair.
1549 for k in (opener_idx + 1)..closer_idx {
1550 if let IrEvent::DelimRun {
1551 ch: ch_k,
1552 can_open: co_k,
1553 can_close: cc_k,
1554 ..
1555 } = &events[k]
1556 && *ch_k == *ch_o
1557 && consumed[k] < total[k]
1558 && *co_k
1559 && *cc_k
1560 {
1561 to_invalidate = Some((opener_idx, mi as u8));
1562 break 'outer;
1563 }
1564 }
1565 }
1566 }
1567
1568 let Some((opener_idx, mi)) = to_invalidate else {
1569 break;
1570 };
1571
1572 // Look up the partner event/offset before mutating.
1573 let (closer_idx, opener_offset) = match &events[opener_idx] {
1574 IrEvent::DelimRun { matches, .. } => {
1575 let m = matches[mi as usize];
1576 (m.partner_event as usize, m.offset_in_run)
1577 }
1578 _ => break,
1579 };
1580
1581 // Remove the opener match.
1582 if let IrEvent::DelimRun { matches, .. } = &mut events[opener_idx] {
1583 matches.remove(mi as usize);
1584 }
1585 // Remove the corresponding closer match (closer's match has
1586 // is_opener=false and partner_offset == opener's offset_in_run).
1587 if let IrEvent::DelimRun { matches, .. } = &mut events[closer_idx] {
1588 matches.retain(|m| m.is_opener || m.partner_offset != opener_offset);
1589 }
1590 invalidated_pairs.push((opener_idx, closer_idx));
1591 }
1592 invalidated_pairs
1593}
1594
1595/// Pandoc-only post-pass: recover the inner Strong match in
1596/// `***A **B** C***` patterns where the IR's standard pass produced
1597/// `Emph[Strong[A], "B**...** C"]` (matching the outer triple as
1598/// Strong+Emph but losing the inner `**...**`-as-Strong-of-`C` pair).
1599///
1600/// Pandoc's recursive descent here goes
1601/// `three c → ender c 2 → one c → option2 → two c`, producing
1602/// `Emph[Strong[A], "B", Strong[C]]` — two Strong nodes inside an outer
1603/// Emph. The standard delim-stack algorithm can't reach this pairing
1604/// because between-removal during the outer Emph match removes the
1605/// inner closer-side `**` (e.g. `bar**`) from the candidate pool.
1606///
1607/// This recovery scans Emph matches whose opener and closer originally
1608/// had count >= 3, and whose closer has unmatched bytes >= 2 after the
1609/// standard pass; for each, we look for an unmatched same-char
1610/// between-run with count >= 2 and `can_close = true` (the would-be
1611/// inner-Strong opener) and synthesise a Strong match that consumes
1612/// the leftmost 2 bytes of the closer (where the existing Emph match
1613/// shifts to the rightmost 1 byte). The byte-position rewrite lets
1614/// the CST emission produce well-nested `Emph[..., Strong[...]]` —
1615/// outer Emph close at the rightmost outer-triple byte, inner Strong
1616/// close at the leftmost two.
1617fn pandoc_inner_strong_recovery(events: &mut [IrEvent]) {
1618 let n = events.len();
1619 // (between_idx, opener_idx, closer_idx, len)
1620 let mut to_apply: Vec<(usize, usize, usize, u8)> = Vec::new();
1621
1622 for opener_idx in 0..n {
1623 let (open_total, open_matches_clone, ch_o) = match &events[opener_idx] {
1624 IrEvent::DelimRun {
1625 start,
1626 end,
1627 matches,
1628 ch,
1629 ..
1630 } => (*end - *start, matches.clone(), *ch),
1631 _ => continue,
1632 };
1633 if open_total < 3 {
1634 continue;
1635 }
1636
1637 for m in open_matches_clone.iter() {
1638 if !m.is_opener || m.kind != EmphasisKind::Emph {
1639 continue;
1640 }
1641 let closer_idx = m.partner_event as usize;
1642 if closer_idx <= opener_idx || closer_idx >= n {
1643 continue;
1644 }
1645
1646 let (close_total, close_consumed) = match &events[closer_idx] {
1647 IrEvent::DelimRun {
1648 start,
1649 end,
1650 matches,
1651 ..
1652 } => {
1653 let total = end - start;
1654 let consumed: usize = matches.iter().map(|m| m.len as usize).sum();
1655 (total, consumed)
1656 }
1657 _ => continue,
1658 };
1659 if close_total < 3 {
1660 continue;
1661 }
1662 let leftover = close_total.saturating_sub(close_consumed);
1663 if leftover < 2 {
1664 continue;
1665 }
1666
1667 // Walk backward from closer-1 looking for the rightmost
1668 // unmatched same-char run with count >= 2 and
1669 // can_close=true.
1670 for k in ((opener_idx + 1)..closer_idx).rev() {
1671 if let IrEvent::DelimRun {
1672 ch,
1673 start,
1674 end,
1675 matches,
1676 can_close,
1677 ..
1678 } = &events[k]
1679 {
1680 if *ch != ch_o || !*can_close {
1681 continue;
1682 }
1683 let total = end - start;
1684 let consumed: usize = matches.iter().map(|m| m.len as usize).sum();
1685 let remaining = total.saturating_sub(consumed);
1686 if remaining < 2 {
1687 continue;
1688 }
1689 to_apply.push((k, opener_idx, closer_idx, 2));
1690 break;
1691 }
1692 }
1693 }
1694 }
1695
1696 for (between_idx, opener_idx, closer_idx, len) in to_apply {
1697 // Find the existing Emph match on the closer side.
1698 let (closer_emph_match_idx, closer_emph_offset) = {
1699 let mut found: Option<(usize, u8)> = None;
1700 if let IrEvent::DelimRun { matches, .. } = &events[closer_idx] {
1701 for (mi, m) in matches.iter().enumerate() {
1702 if !m.is_opener
1703 && m.partner_event as usize == opener_idx
1704 && m.kind == EmphasisKind::Emph
1705 {
1706 found = Some((mi, m.offset_in_run));
1707 break;
1708 }
1709 }
1710 }
1711 match found {
1712 Some(x) => x,
1713 None => continue,
1714 }
1715 };
1716
1717 // Find the corresponding Emph match on the opener side.
1718 let opener_emph_match_idx = {
1719 let mut found: Option<usize> = None;
1720 if let IrEvent::DelimRun { matches, .. } = &events[opener_idx] {
1721 for (mi, m) in matches.iter().enumerate() {
1722 if m.is_opener
1723 && m.partner_event as usize == closer_idx
1724 && m.kind == EmphasisKind::Emph
1725 {
1726 found = Some(mi);
1727 break;
1728 }
1729 }
1730 }
1731 match found {
1732 Some(x) => x,
1733 None => continue,
1734 }
1735 };
1736
1737 // Shift the Emph closer's offset to the right of the new
1738 // Strong closer's bytes (Strong takes leftmost `len` bytes,
1739 // Emph takes the next byte).
1740 let new_closer_emph_offset = closer_emph_offset + len;
1741
1742 // Update closer's Emph offset_in_run.
1743 if let IrEvent::DelimRun { matches, .. } = &mut events[closer_idx] {
1744 matches[closer_emph_match_idx].offset_in_run = new_closer_emph_offset;
1745 }
1746 // Update opener's Emph partner_offset to point at the shifted
1747 // Emph closer position.
1748 if let IrEvent::DelimRun { matches, .. } = &mut events[opener_idx] {
1749 matches[opener_emph_match_idx].partner_offset = new_closer_emph_offset;
1750 }
1751
1752 // Add Strong opener match on the between-run.
1753 if let IrEvent::DelimRun { matches, .. } = &mut events[between_idx] {
1754 matches.push(DelimMatch {
1755 offset_in_run: 0,
1756 len,
1757 is_opener: true,
1758 partner_event: closer_idx as u32,
1759 partner_offset: closer_emph_offset,
1760 kind: EmphasisKind::Strong,
1761 });
1762 }
1763 // Add Strong closer match on the closer (at the original
1764 // pre-shift Emph-closer position; the bytes that were the
1765 // single Emph closer now become the leftmost 2 bytes of the
1766 // Strong closer).
1767 if let IrEvent::DelimRun { matches, .. } = &mut events[closer_idx] {
1768 matches.push(DelimMatch {
1769 offset_in_run: closer_emph_offset,
1770 len,
1771 is_opener: false,
1772 partner_event: between_idx as u32,
1773 partner_offset: 0,
1774 kind: EmphasisKind::Strong,
1775 });
1776 }
1777 }
1778}
1779
1780fn source_start_event(event: &IrEvent) -> usize {
1781 match event {
1782 IrEvent::DelimRun { start, .. } => *start,
1783 _ => unreachable!("source_start_event called on non-DelimRun"),
1784 }
1785}
1786
1787// ============================================================================
// Process brackets (CommonMark §6.3) — runs before the emphasis pass
1789// ============================================================================
1790
1791/// Resolve `[`/`![`/`]` markers into link/image nodes per CommonMark §6.3
1792/// (with Pandoc-aware variations under `Dialect::Pandoc`).
1793///
1794/// Walks the IR forward looking for `]` markers. For each one, finds the
1795/// nearest active matching `[`/`` or `[text](dest "title")`.
1799/// 2. Full reference: `[text][label]`, where `label` is in `refdefs`.
1800/// 3. Collapsed reference: `[text][]`, where `text` (normalised) is in
1801/// `refdefs`.
1802/// 4. Shortcut reference: `[text]` not followed by `(` or `[`, where
1803/// `text` (normalised) is in `refdefs`.
1804///
1805/// On a match, the opener gets a `BracketResolution` and the closer is
1806/// flagged `matched`. Under `Dialect::CommonMark`, all earlier active link
1807/// openers are deactivated to implement the §6.3 "links may not contain
1808/// other links" rule (image brackets do not deactivate earlier link
1809/// openers — only links do). Under `Dialect::Pandoc`, the deactivate-pass
1810/// is skipped: pandoc-native is outer-wins for nested links (the inner
1811/// `[inner](u2)` of `[link [inner](u2)](u1)` is literal text inside the
1812/// outer link), and the dispatcher enforces this via a `suppress_inner_links`
1813/// flag during LINK-text recursion. So under Pandoc the IR can leave both
1814/// outer and inner resolved and trust the dispatcher to suppress inner
1815/// LINK emission.
1816///
1817/// On a miss the bracket pair stays opaque-as-literal and the closer is
1818/// dropped from the bracket stack so the next `]` can re-pair.
1819///
1820/// Reference-form resolution under `Dialect::Pandoc` is shape-only: any
1821/// non-empty link text or label resolves regardless of refdef presence,
1822/// matching the historical legacy `reference_resolves`-returns-`true`
1823/// behavior. (Pandoc emits LINK nodes for unresolved shortcut/collapsed/
1824/// full-reference shapes so downstream features — linter, LSP, formatter
1825/// — have a typed wrapper to walk. Refdef-aware resolution under Pandoc
1826/// is bug #1/#2 territory and is a parser-linter-LSP cross-cut deferred
1827/// to a future workstream.)
1828pub fn process_brackets(
1829 events: &mut [IrEvent],
1830 text: &str,
1831 refdefs: Option<&RefdefMap>,
1832 dialect: crate::options::Dialect,
1833) {
1834 let empty: HashSet<String> = HashSet::new();
1835 let labels: &HashSet<String> = match refdefs {
1836 Some(map) => map.as_ref(),
1837 None => &empty,
1838 };
1839 let is_commonmark = dialect == crate::options::Dialect::CommonMark;
1840 // Under Pandoc, any non-empty reference label resolves shape-only —
1841 // matches the legacy `reference_resolves` short-circuit. Under
1842 // CommonMark, the refdef map is consulted.
1843 let label_resolves = |key_norm: &str| -> bool {
1844 !key_norm.is_empty() && (!is_commonmark || labels.contains(key_norm))
1845 };
1846
1847 // Walk forward through events, treating it as a linear scan for `]`.
1848 let mut i = 0;
1849 while i < events.len() {
1850 let close_pos = match &events[i] {
1851 IrEvent::CloseBracket { pos, .. } => *pos,
1852 _ => {
1853 i += 1;
1854 continue;
1855 }
1856 };
1857
1858 // Find the nearest active OpenBracket before `i`.
1859 let mut o = match find_active_opener(events, i) {
1860 Some(o) => o,
1861 None => {
1862 i += 1;
1863 continue;
1864 }
1865 };
1866
1867 let (open_end, is_image) = match &events[o] {
1868 IrEvent::OpenBracket { end, is_image, .. } => (*end, *is_image),
1869 _ => unreachable!(),
1870 };
1871 let text_start = open_end;
1872 let text_end = close_pos;
1873 let after_close = close_pos + 1;
1874
1875 // 1. Inline link / image.
1876 if let Some((suffix_end, dest, title)) = try_inline_suffix(text, after_close) {
1877 // §6.3 link-in-link rule (CommonMark): if this is a *link*
1878 // (not an image), and any earlier active link opener exists,
1879 // deactivate them. We also deactivate openers strictly before
1880 // `o` here because matching means the inner link wins; the
1881 // spec applies this *after* matching. Pandoc skips this —
1882 // outer-wins is enforced by the dispatcher's
1883 // `suppress_inner_links` flag during LINK-text recursion.
1884 if !is_image && is_commonmark {
1885 deactivate_earlier_link_openers(events, o);
1886 }
1887 commit_resolution(
1888 events,
1889 o,
1890 i,
1891 text_start,
1892 text_end,
1893 after_close,
1894 suffix_end,
1895 LinkKind::Inline { dest, title },
1896 );
1897 // Remove the opener from the bracket stack: it has been
1898 // matched (active=false will fall out automatically since
1899 // resolution is Some).
1900 mark_opener_resolved(events, o);
1901 i += 1;
1902 continue;
1903 }
1904
1905 // 2. Full reference link: `[text][label]`.
1906 let full_ref_suffix = try_full_reference_suffix(text, after_close);
1907 if let Some((suffix_end, label_raw)) = &full_ref_suffix {
1908 let label_norm = normalize_label(label_raw);
1909 if label_resolves(&label_norm) {
1910 if !is_image && is_commonmark {
1911 deactivate_earlier_link_openers(events, o);
1912 }
1913 commit_resolution(
1914 events,
1915 o,
1916 i,
1917 text_start,
1918 text_end,
1919 after_close,
1920 *suffix_end,
1921 LinkKind::FullReference {
1922 label: label_raw.clone(),
1923 },
1924 );
1925 mark_opener_resolved(events, o);
1926 i += 1;
1927 continue;
1928 }
1929 // Bracketed but unresolved label: §6.3 says we still treat
1930 // `[text][label]` as not-a-link, but the brackets get
1931 // consumed as literal text AND the shortcut form is
1932 // suppressed (since the `]` is followed by a link label).
1933 }
1934
1935 // 3. Collapsed `[]`.
1936 let link_text = &text[text_start..text_end];
1937 let link_text_norm = normalize_label(link_text);
1938 let is_collapsed = is_collapsed_marker(text, after_close);
1939 let collapsed_suffix_end = after_close + 2;
1940
1941 if is_collapsed && label_resolves(&link_text_norm) {
1942 if !is_image && is_commonmark {
1943 deactivate_earlier_link_openers(events, o);
1944 }
1945 commit_resolution(
1946 events,
1947 o,
1948 i,
1949 text_start,
1950 text_end,
1951 after_close,
1952 collapsed_suffix_end,
1953 LinkKind::CollapsedReference,
1954 );
1955 mark_opener_resolved(events, o);
1956 i += 1;
1957 continue;
1958 }
1959 // `[text][]` with text not in refdefs — falls through to
1960 // literal text; shortcut is suppressed (followed by `[]`).
1961
1962 // 4. Shortcut form: `[text]` not followed by `[]` or `[label]`.
1963 // Per CommonMark §6.3: "A shortcut reference link consists of a
1964 // link label that matches a link reference definition elsewhere
1965 // in the document and is not followed by [] or a link label."
1966 // The full-ref / collapsed shape attempts above suppress the
1967 // shortcut even when their labels don't resolve — the bracket
1968 // bytes still get consumed as literal text.
1969 let shortcut_suppressed = full_ref_suffix.is_some() || is_collapsed;
1970 if !shortcut_suppressed && label_resolves(&link_text_norm) {
1971 if !is_image && is_commonmark {
1972 deactivate_earlier_link_openers(events, o);
1973 }
1974 commit_resolution(
1975 events,
1976 o,
1977 i,
1978 text_start,
1979 text_end,
1980 after_close,
1981 after_close,
1982 LinkKind::ShortcutReference,
1983 );
1984 mark_opener_resolved(events, o);
1985 i += 1;
1986 continue;
1987 }
1988
1989 // No resolution. Drop the opener — its `]` partner is this one,
1990 // but since neither matched, the opener falls through to literal
1991 // text. We do this by deactivating the opener (so it won't be
1992 // considered for later `]` markers either).
1993 if let IrEvent::OpenBracket { active, .. } = &mut events[o] {
1994 *active = false;
1995 }
1996 let _ = &mut o;
1997 i += 1;
1998 }
1999}
2000
2001fn find_active_opener(events: &[IrEvent], close_idx: usize) -> Option<usize> {
2002 (0..close_idx).rev().find(|&i| {
2003 matches!(
2004 &events[i],
2005 IrEvent::OpenBracket {
2006 active: true,
2007 resolution: None,
2008 ..
2009 }
2010 )
2011 })
2012}
2013
2014fn deactivate_earlier_link_openers(events: &mut [IrEvent], open_idx: usize) {
2015 for ev in &mut events[..open_idx] {
2016 if let IrEvent::OpenBracket {
2017 is_image: false,
2018 active,
2019 resolution: None,
2020 ..
2021 } = ev
2022 {
2023 *active = false;
2024 }
2025 }
2026}
2027
2028fn mark_opener_resolved(events: &mut [IrEvent], open_idx: usize) {
2029 if let IrEvent::OpenBracket { active, .. } = &mut events[open_idx] {
2030 *active = false;
2031 }
2032}
2033
2034#[allow(clippy::too_many_arguments)]
2035fn commit_resolution(
2036 events: &mut [IrEvent],
2037 open_idx: usize,
2038 close_idx: usize,
2039 text_start: usize,
2040 text_end: usize,
2041 suffix_start: usize,
2042 suffix_end: usize,
2043 kind: LinkKind,
2044) {
2045 if let IrEvent::OpenBracket { resolution, .. } = &mut events[open_idx] {
2046 *resolution = Some(BracketResolution {
2047 close_event: close_idx as u32,
2048 text_start,
2049 text_end,
2050 suffix_start,
2051 suffix_end,
2052 kind,
2053 });
2054 }
2055 if let IrEvent::CloseBracket { matched, .. } = &mut events[close_idx] {
2056 *matched = true;
2057 }
2058}
2059
2060/// Try to parse `(dest)` or `(dest "title")` inline link suffix starting
2061/// at `text[pos]`. Returns `(end_pos_exclusive, dest, title)`.
2062fn try_inline_suffix(text: &str, pos: usize) -> Option<(usize, String, Option<String>)> {
2063 let bytes = text.as_bytes();
2064 if pos >= bytes.len() || bytes[pos] != b'(' {
2065 return None;
2066 }
2067 let mut p = pos + 1;
2068 // Skip leading whitespace.
2069 while p < bytes.len() && matches!(bytes[p], b' ' | b'\t' | b'\n') {
2070 p += 1;
2071 }
2072 // Empty `()` — link with empty destination.
2073 if p < bytes.len() && bytes[p] == b')' {
2074 return Some((p + 1, String::new(), None));
2075 }
2076
2077 // Parse destination.
2078 let (dest, dest_end) = parse_link_destination(text, p)?;
2079 p = dest_end;
2080
2081 // Skip whitespace.
2082 while p < bytes.len() && matches!(bytes[p], b' ' | b'\t' | b'\n') {
2083 p += 1;
2084 }
2085
2086 // Optional title.
2087 let mut title = None;
2088 if p < bytes.len() && matches!(bytes[p], b'"' | b'\'' | b'(') {
2089 let (t, t_end) = parse_link_title(text, p)?;
2090 title = Some(t);
2091 p = t_end;
2092 while p < bytes.len() && matches!(bytes[p], b' ' | b'\t' | b'\n') {
2093 p += 1;
2094 }
2095 }
2096
2097 if p >= bytes.len() || bytes[p] != b')' {
2098 return None;
2099 }
2100 Some((p + 1, dest, title))
2101}
2102
/// Parse a link destination beginning at byte `start` of `text`.
///
/// Two forms are recognised:
///
/// * `<bracketed>` — everything up to the closing `>`; a newline or a
///   second `<` before it rejects the destination. The returned string
///   excludes the angle brackets.
/// * unbracketed — runs until whitespace, a control byte, or a `)` at
///   paren depth zero; inner parentheses must balance and the result
///   must be non-empty.
///
/// In both forms a backslash skips the byte that follows it. Returns the
/// raw destination text and the index just past it.
fn parse_link_destination(text: &str, start: usize) -> Option<(String, usize)> {
    let bytes = text.as_bytes();
    let first = *bytes.get(start)?;

    if first == b'<' {
        let begin = start + 1;
        let mut cursor = begin;
        loop {
            match bytes.get(cursor) {
                Some(b'>') => return Some((text[begin..cursor].to_string(), cursor + 1)),
                None | Some(b'\n') | Some(b'<') => return None,
                Some(b'\\') if cursor + 1 < bytes.len() => cursor += 2,
                Some(_) => cursor += 1,
            }
        }
    }

    // Unbracketed: balanced parens, no spaces, no controls.
    let mut cursor = start;
    let mut depth: i32 = 0;
    while let Some(&b) = bytes.get(cursor) {
        match b {
            b'\\' if cursor + 1 < bytes.len() => cursor += 2,
            b'(' => {
                depth += 1;
                cursor += 1;
            }
            // An unmatched `)` belongs to the enclosing suffix.
            b')' if depth == 0 => break,
            b')' => {
                depth -= 1;
                cursor += 1;
            }
            b' ' | 0x7f | 0x00..=0x1f => break,
            _ => cursor += 1,
        }
    }

    if cursor == start || depth != 0 {
        return None;
    }
    Some((text[start..cursor].to_string(), cursor))
}
2158
/// Parse a link title whose opening delimiter is at byte `start` of `text`.
///
/// The delimiter may be `"`, `'` or `(`; the title ends at the first
/// unescaped matching closer (`"`, `'` or `)` respectively). A backslash
/// skips the byte that follows it. Returns the raw title text (without
/// delimiters) and the index just past the closing delimiter.
///
/// Returns `None` when `start` is out of bounds, when the byte at `start`
/// is not a title delimiter, or when no closing delimiter is found.
fn parse_link_title(text: &str, start: usize) -> Option<(String, usize)> {
    let bytes = text.as_bytes();
    // Checked access: an out-of-range `start` is "no title", not a panic.
    let close = match *bytes.get(start)? {
        b'"' => b'"',
        b'\'' => b'\'',
        b'(' => b')',
        _ => return None,
    };
    let begin = start + 1;
    let mut p = begin;
    while let Some(&b) = bytes.get(p) {
        if b == b'\\' && p + 1 < bytes.len() {
            // Escaped byte: never treated as the closer.
            p += 2;
            continue;
        }
        if b == close {
            return Some((text[begin..p].to_string(), p + 1));
        }
        p += 1;
    }
    None
}
2184
/// Attempt to read a `[label]` reference suffix whose `[` is at
/// `text[pos]`. Returns `(suffix_end, label_raw)` where `suffix_end` is
/// the index just past the closing `]`.
///
/// A backslash skips the byte after it, an unescaped `[` inside the label
/// rejects the suffix, and newlines are allowed. The empty label `[]`
/// returns `None` here — that collapsed form is detected separately by
/// `is_collapsed_marker`.
fn try_full_reference_suffix(text: &str, pos: usize) -> Option<(usize, String)> {
    let bytes = text.as_bytes();
    if bytes.get(pos) != Some(&b'[') {
        return None;
    }

    let label_start = pos + 1;
    let mut cursor = label_start;
    // Running off the end of the input means the label never closed.
    loop {
        match *bytes.get(cursor)? {
            b']' => break,
            b'[' => return None,
            b'\\' => cursor += 2,
            _ => cursor += 1,
        }
    }

    if cursor == label_start {
        return None;
    }
    Some((cursor + 1, text[label_start..cursor].to_string()))
}
2224
/// Reports whether the two bytes starting at `pos` are exactly `[]`, the
/// collapsed-reference marker. Out-of-range positions yield `false`.
fn is_collapsed_marker(text: &str, pos: usize) -> bool {
    matches!(text.as_bytes().get(pos..), Some([b'[', b']', ..]))
}
2228
2229// ============================================================================
2230// Bracket plan — byte-position-keyed view of resolved brackets, consumed by
2231// the existing emission walk in `core::parse_inline_range_impl`.
2232// ============================================================================
2233
/// Disposition of a single bracket byte after [`process_brackets`].
///
/// Entries are produced by [`build_bracket_plan`] and looked up by source
/// byte position through [`BracketPlan::lookup`].
#[derive(Debug, Clone)]
pub enum BracketDispo {
    /// `[` or `![` of a resolved link/image. Emission emits the LINK/IMAGE
    /// node and skips past `suffix_end`.
    ///
    /// All offsets and `kind` are copied from the opener's resolution;
    /// `is_image` is copied from the opener itself.
    Open {
        is_image: bool,
        text_start: usize,
        text_end: usize,
        suffix_start: usize,
        suffix_end: usize,
        kind: LinkKind,
    },
    /// Bracket byte (one of `[`, `]`, or `!`) that fell through to literal
    /// text. Emission accumulates into the surrounding text run.
    Literal,
}
2251
/// A byte-keyed view of the IR's bracket resolutions.
///
/// Built by [`build_bracket_plan`]; the default value is an empty plan.
#[derive(Debug, Default, Clone)]
pub struct BracketPlan {
    // Source byte position of a bracket byte -> what emission should do there.
    by_pos: BTreeMap<usize, BracketDispo>,
}
2257
impl BracketPlan {
    /// Returns the disposition keyed at source byte `pos`, or `None` when
    /// the plan has no entry for that position.
    pub fn lookup(&self, pos: usize) -> Option<&BracketDispo> {
        self.by_pos.get(&pos)
    }

    /// Returns `true` when the plan contains no entries at all.
    pub fn is_empty(&self) -> bool {
        self.by_pos.is_empty()
    }
}
2267
/// A standalone Pandoc inline construct recognised by `build_ir` and
/// dispatched directly from the emission walk. Carries the construct's
/// full source range so the emission walk can slice the content for the
/// existing `emit_*` helpers without re-running the recognition.
///
/// The start of the range is the key under which the value is stored in
/// [`ConstructPlan`]; each variant's `end` is the `end` of the originating
/// `IrEvent::Construct` (presumably exclusive, like other IR ranges —
/// confirm against `build_ir`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConstructDispo {
    /// `^[note text]` — emit via `emit_inline_footnote` after slicing
    /// the inner content.
    InlineFootnote { end: usize },
    /// `<span ...>...</span>` — emit via `emit_native_span` after
    /// re-parsing the open-tag attributes from the source range.
    NativeSpan { end: usize },
    /// `[^id]` — emit via `emit_footnote_reference` after extracting
    /// the label id from the source range.
    FootnoteReference { end: usize },
    /// `[@cite]` — emit via `emit_bracketed_citation` after slicing
    /// the inner content.
    BracketedCitation { end: usize },
    /// `@key` or `-@key` — emit via `emit_bare_citation` (or
    /// `emit_crossref` when `is_quarto_crossref_key` matches and
    /// `extensions.quarto_crossrefs` is enabled).
    BareCitation { end: usize },
    /// `[content]{attrs}` — emit via `emit_bracketed_span` after
    /// slicing the inner content and attribute string.
    BracketedSpan { end: usize },
}
2294
/// A byte-keyed view of the IR's standalone Pandoc constructs that the
/// emission walk consumes directly: inline footnotes, native spans,
/// footnote references, bracketed citations, bare citations, and
/// bracketed spans. Recognition is authoritative in `build_ir` under
/// `Dialect::Pandoc`; the dispatcher's legacy branches for these
/// constructs (`^[`, `<span>`, `[^id]`, `[@cite]`, `@cite` / `-@cite`,
/// `[text]{attrs}`) are gated to `Dialect::CommonMark` only and only
/// fire when the relevant extension is explicitly enabled.
///
/// Built by [`build_construct_plan`]; the default value is an empty plan.
#[derive(Debug, Default, Clone)]
pub struct ConstructPlan {
    // Start byte of a construct -> which construct it is and where it ends.
    by_pos: BTreeMap<usize, ConstructDispo>,
}
2307
impl ConstructPlan {
    /// Returns the construct keyed at source byte `pos` (its start byte),
    /// or `None` when no construct starts there.
    pub fn lookup(&self, pos: usize) -> Option<&ConstructDispo> {
        self.by_pos.get(&pos)
    }

    /// Returns `true` when the plan contains no constructs.
    pub fn is_empty(&self) -> bool {
        self.by_pos.is_empty()
    }
}
2317
2318/// Build a [`ConstructPlan`] from the resolved IR. Each
2319/// `Construct { kind: InlineFootnote | NativeSpan, .. }` becomes one
2320/// entry keyed at its start byte.
2321pub fn build_construct_plan(events: &[IrEvent]) -> ConstructPlan {
2322 let mut by_pos: BTreeMap<usize, ConstructDispo> = BTreeMap::new();
2323 for ev in events {
2324 if let IrEvent::Construct { start, end, kind } = ev {
2325 match kind {
2326 ConstructKind::InlineFootnote => {
2327 by_pos.insert(*start, ConstructDispo::InlineFootnote { end: *end });
2328 }
2329 ConstructKind::NativeSpan => {
2330 by_pos.insert(*start, ConstructDispo::NativeSpan { end: *end });
2331 }
2332 ConstructKind::FootnoteReference => {
2333 by_pos.insert(*start, ConstructDispo::FootnoteReference { end: *end });
2334 }
2335 ConstructKind::BracketedCitation => {
2336 by_pos.insert(*start, ConstructDispo::BracketedCitation { end: *end });
2337 }
2338 ConstructKind::BareCitation => {
2339 by_pos.insert(*start, ConstructDispo::BareCitation { end: *end });
2340 }
2341 ConstructKind::BracketedSpan => {
2342 by_pos.insert(*start, ConstructDispo::BracketedSpan { end: *end });
2343 }
2344 _ => {}
2345 }
2346 }
2347 }
2348 ConstructPlan { by_pos }
2349}
2350
2351/// Build a [`BracketPlan`] from the resolved IR. Each `OpenBracket`
2352/// resolution becomes an [`BracketDispo::Open`] keyed at the opener's
2353/// start byte. Unresolved openers and unmatched closers become
2354/// `BracketDispo::Literal` so the emission path can recognise them
2355/// without re-parsing.
2356pub fn build_bracket_plan(events: &[IrEvent]) -> BracketPlan {
2357 let mut by_pos: BTreeMap<usize, BracketDispo> = BTreeMap::new();
2358 for ev in events {
2359 match ev {
2360 IrEvent::OpenBracket {
2361 start,
2362 is_image,
2363 resolution: Some(res),
2364 ..
2365 } => {
2366 by_pos.insert(
2367 *start,
2368 BracketDispo::Open {
2369 is_image: *is_image,
2370 text_start: res.text_start,
2371 text_end: res.text_end,
2372 suffix_start: res.suffix_start,
2373 suffix_end: res.suffix_end,
2374 kind: res.kind.clone(),
2375 },
2376 );
2377 }
2378 IrEvent::OpenBracket {
2379 start,
2380 is_image,
2381 resolution: None,
2382 ..
2383 } => {
2384 let len = if *is_image { 2 } else { 1 };
2385 for off in 0..len {
2386 by_pos.insert(*start + off, BracketDispo::Literal);
2387 }
2388 }
2389 IrEvent::CloseBracket {
2390 pos,
2391 matched: false,
2392 } => {
2393 by_pos.insert(*pos, BracketDispo::Literal);
2394 }
2395 _ => {}
2396 }
2397 }
2398 BracketPlan { by_pos }
2399}
2400
/// One-shot helper: build the IR, run all passes, and return the
/// bundled [`InlinePlans`] (emphasis dispositions, bracket resolutions,
/// and standalone Pandoc constructs) — packaged together so the inline
/// emission path can consume them in one go for either dialect.
///
/// Pass ordering follows the CommonMark §6.3 reference impl: bracket
/// resolution runs first, then emphasis is processed *scoped per resolved
/// bracket pair's inner event range*, then once more on the residual
/// top-level events. This prevents emphasis pairs from forming across a
/// link's bracket boundary, which the previous "all-emphasis-then-all-
/// brackets" order got wrong (e.g. spec example #473).
///
/// `text`, `start` and `end` are forwarded unchanged to `build_ir_into`
/// (presumably the source and the byte range to scan — confirm against
/// that function). `config` supplies the dialect and the optional refdef
/// label set. The event buffers come from a thread-local scratch pool and
/// are handed back when the function returns; only the returned plans
/// outlive the call.
pub fn build_full_plans(
    text: &str,
    start: usize,
    end: usize,
    config: &ParserOptions,
) -> InlinePlans {
    let mut scratch = ScratchEvents::checkout();
    // `checkout` always stores `Some(..)`, and only `Drop` takes it out.
    let bundle = scratch.inner.as_mut().unwrap();
    // Defensive reset: pooled bundles are cleared on return, fresh ones
    // start empty.
    bundle.events.clear();
    bundle.bracket_pairs.clear();
    bundle.excluded.clear();

    build_ir_into(text, start, end, config, &mut bundle.events);
    // §6.3 bracket resolution runs for both dialects. Under CommonMark
    // it enforces refdef-aware shortcut/collapsed/full-ref resolution
    // and the §6.3 link-in-link deactivation rule. Under Pandoc it
    // performs shape-only resolution (any non-empty label resolves) and
    // skips the deactivation pass — pandoc-native is outer-wins for
    // nested links and the dispatcher's `suppress_inner_links` flag
    // suppresses inner LINK emission during LINK-text recursion.
    process_brackets(
        &mut bundle.events,
        text,
        config.refdef_labels.as_ref(),
        config.dialect,
    );

    // Scoped emphasis pass per resolved bracket pair, innermost first.
    // We collect (open_idx, close_idx) pairs of resolved brackets and run
    // emphasis only over the events strictly between them. Innermost-first
    // ordering matters: an outer link wraps emphasis that wraps an inner
    // link, and the inner link's inner range must be paired before the
    // outer's inner range so the top-level pass sees consistent state.
    bundle.bracket_pairs.extend(
        bundle
            .events
            .iter()
            .enumerate()
            .filter_map(|(i, ev)| match ev {
                IrEvent::OpenBracket {
                    resolution: Some(res),
                    ..
                } => Some((i, res.close_event as usize)),
                _ => None,
            }),
    );
    // Innermost-first: sort by close_idx ascending, then open_idx descending.
    bundle
        .bracket_pairs
        .sort_by(|a, b| a.1.cmp(&b.1).then(b.0.cmp(&a.0)));
    // Iterate pairs by index so we can hold &mut bundle.events while
    // reading bundle.bracket_pairs (split borrow on disjoint fields).
    for i in 0..bundle.bracket_pairs.len() {
        let (open_idx, close_idx) = bundle.bracket_pairs[i];
        // Range is the events strictly between the opener and its closer.
        process_emphasis_in_range(&mut bundle.events, open_idx + 1, close_idx, config.dialect);
    }

    // Top-level emphasis pass: handles delim runs that fall outside any
    // resolved bracket pair.
    let len = bundle.events.len();
    if bundle.bracket_pairs.is_empty() {
        // Fast path: no resolved brackets means no exclusion mask needed —
        // skip the resize-and-fill pass entirely. Common for prose
        // paragraphs without inline links.
        process_emphasis_in_range_filtered(&mut bundle.events, 0, len, None, config.dialect);
    } else {
        // Build exclusion bitmap: any delim run whose event index lies
        // inside a resolved bracket pair is excluded from the top-level
        // pass. Implements the §6.3 boundary rule: emphasis at the top
        // level must not pair across a link's brackets.
        bundle.excluded.resize(len, false);
        for &(open_idx, close_idx) in &bundle.bracket_pairs {
            // Marks event indices open_idx + 1 .. close_idx (exclusive).
            for slot in bundle
                .excluded
                .iter_mut()
                .take(close_idx)
                .skip(open_idx + 1)
            {
                *slot = true;
            }
        }
        process_emphasis_in_range_filtered(
            &mut bundle.events,
            0,
            len,
            Some(&bundle.excluded),
            config.dialect,
        );
    }

    // The plans are built from the final event state before `scratch`
    // is dropped and its buffers are recycled.
    InlinePlans {
        emphasis: build_emphasis_plan(&bundle.events),
        brackets: build_bracket_plan(&bundle.events),
        constructs: build_construct_plan(&bundle.events),
    }
}
2508
/// Checkout guard for the thread-local pool of scratch buffers used by
/// [`build_full_plans`].
///
/// `build_full_plans` checks out one bundle for the duration of the call
/// and returns it on drop so the next call (or a recursive nested call
/// from an inline emitter) reuses the allocations. The pool is
/// per-thread — the parser is single-threaded — and bounded so a
/// long-running editor session can't accumulate stale capacity.
struct ScratchEvents {
    // `Some` from `checkout` until `Drop` takes the bundle back out.
    inner: Option<ScratchBundle>,
}
2519
/// The reusable buffers one [`build_full_plans`] call works in.
#[derive(Default)]
struct ScratchBundle {
    // IR events for the range being parsed.
    events: Vec<IrEvent>,
    // `(open_idx, close_idx)` event indices of each resolved bracket pair.
    bracket_pairs: Vec<(usize, usize)>,
    // Per-event mask: `true` for events inside a resolved bracket pair,
    // which the top-level emphasis pass must skip.
    excluded: Vec<bool>,
}
2526
// Per-thread stack of idle scratch bundles. `ScratchEvents::checkout`
// pops from it and `ScratchEvents`'s `Drop` pushes bundles back.
thread_local! {
    static IR_EVENT_POOL: std::cell::RefCell<Vec<ScratchBundle>> =
        const { std::cell::RefCell::new(Vec::new()) };
}
2531
impl ScratchEvents {
    /// Take a bundle from this thread's pool, or create a fresh default
    /// (empty) one when the pool has none. The bundle goes back to the
    /// pool when the returned guard is dropped.
    fn checkout() -> Self {
        let bundle = IR_EVENT_POOL
            .with(|p| p.borrow_mut().pop())
            .unwrap_or_default();
        Self {
            inner: Some(bundle),
        }
    }
}
2542
2543impl Drop for ScratchEvents {
2544 fn drop(&mut self) {
2545 if let Some(mut bundle) = self.inner.take() {
2546 bundle.events.clear();
2547 bundle.bracket_pairs.clear();
2548 bundle.excluded.clear();
2549 // Cap pool depth at 8 (deepest realistic nested-link recursion)
2550 // and drop any bundle whose `events` grew past 8K (a single
2551 // pathological paragraph shouldn't pin a huge allocation
2552 // forever).
2553 if bundle.events.capacity() <= 8192 {
2554 IR_EVENT_POOL.with(|p| {
2555 let mut pool = p.borrow_mut();
2556 if pool.len() < 8 {
2557 pool.push(bundle);
2558 }
2559 });
2560 }
2561 }
2562 }
2563}
2564
/// Bundle of plans produced by [`build_full_plans`] and consumed by the
/// inline emission walk.
#[derive(Debug, Default, Clone)]
pub struct InlinePlans {
    /// Delimiter-run dispositions, from [`build_emphasis_plan`].
    pub emphasis: EmphasisPlan,
    /// Bracket dispositions, from [`build_bracket_plan`].
    pub brackets: BracketPlan,
    /// Standalone Pandoc constructs, from [`build_construct_plan`].
    pub constructs: ConstructPlan,
}
2573
2574/// Convert the IR's delim-run match decisions into an [`EmphasisPlan`],
2575/// preserving the byte-keyed disposition shape the existing emission walk
2576/// consumes.
2577///
2578/// Each match on a [`DelimRun`](IrEvent::DelimRun) produces one entry in
2579/// the plan: the opener side records `Open` with the partner's source
2580/// byte and length; the closer side records `Close`. Bytes within a run
2581/// that are *not* covered by any match get a `Literal` entry, which the
2582/// emission walk uses to coalesce unmatched delimiter bytes with
2583/// surrounding plain text.
2584pub fn build_emphasis_plan(events: &[IrEvent]) -> EmphasisPlan {
2585 let mut by_pos: BTreeMap<usize, DelimChar> = BTreeMap::new();
2586 for ev in events {
2587 if let IrEvent::DelimRun {
2588 start,
2589 end,
2590 matches,
2591 ..
2592 } = ev
2593 {
2594 for m in matches {
2595 let pos = *start + m.offset_in_run as usize;
2596 let partner_run_start = match &events[m.partner_event as usize] {
2597 IrEvent::DelimRun { start: ps, .. } => *ps,
2598 _ => continue,
2599 };
2600 let partner_pos = partner_run_start + m.partner_offset as usize;
2601 if m.is_opener {
2602 by_pos.insert(
2603 pos,
2604 DelimChar::Open {
2605 len: m.len,
2606 partner: partner_pos,
2607 partner_len: m.len,
2608 kind: m.kind,
2609 },
2610 );
2611 } else {
2612 by_pos.insert(pos, DelimChar::Close);
2613 }
2614 }
2615 // Any remaining bytes (not covered by a match) are literal.
2616 for pos in *start..*end {
2617 by_pos.entry(pos).or_insert(DelimChar::Literal);
2618 }
2619 }
2620 }
2621 EmphasisPlan::from_dispositions(by_pos)
2622}
2623
2624#[cfg(test)]
2625mod tests {
2626 use super::*;
2627 use crate::options::Flavor;
2628 use crate::parser::inlines::inline_ir::DelimChar;
2629 use std::sync::Arc;
2630
2631 fn cm_opts() -> ParserOptions {
2632 let flavor = Flavor::CommonMark;
2633 ParserOptions {
2634 flavor,
2635 dialect: crate::options::Dialect::for_flavor(flavor),
2636 extensions: crate::options::Extensions::for_flavor(flavor),
2637 pandoc_compat: crate::options::PandocCompat::default(),
2638 refdef_labels: None,
2639 }
2640 }
2641
2642 fn refdefs<I: IntoIterator<Item = &'static str>>(labels: I) -> RefdefMap {
2643 Arc::new(labels.into_iter().map(|s| s.to_string()).collect())
2644 }
2645
2646 #[test]
2647 fn ir_event_range_covers_all_variants() {
2648 let txt = IrEvent::Text { start: 0, end: 5 };
2649 assert_eq!(txt.range(), (0, 5));
2650
2651 let close = IrEvent::CloseBracket {
2652 pos: 7,
2653 matched: false,
2654 };
2655 assert_eq!(close.range(), (7, 8));
2656
2657 let open = IrEvent::OpenBracket {
2658 start: 1,
2659 end: 3,
2660 is_image: true,
2661 active: true,
2662 resolution: None,
2663 };
2664 assert_eq!(open.range(), (1, 3));
2665 }
2666
2667 #[test]
2668 fn scan_records_text_and_delim_run() {
2669 let opts = cm_opts();
2670 let ir = build_ir("foo *bar*", 0, 9, &opts);
2671 // Expect: Text "foo ", DelimRun "*", Text "bar", DelimRun "*"
2672 assert!(matches!(ir[0], IrEvent::Text { start: 0, end: 4 }));
2673 assert!(matches!(
2674 ir[1],
2675 IrEvent::DelimRun {
2676 ch: b'*',
2677 start: 4,
2678 end: 5,
2679 ..
2680 }
2681 ));
2682 assert!(matches!(ir[2], IrEvent::Text { start: 5, end: 8 }));
2683 assert!(matches!(
2684 ir[3],
2685 IrEvent::DelimRun {
2686 ch: b'*',
2687 start: 8,
2688 end: 9,
2689 ..
2690 }
2691 ));
2692 }
2693
2694 #[test]
2695 fn scan_records_brackets() {
2696 let opts = cm_opts();
2697 let ir = build_ir("[foo]", 0, 5, &opts);
2698 assert!(matches!(
2699 ir[0],
2700 IrEvent::OpenBracket {
2701 start: 0,
2702 end: 1,
2703 is_image: false,
2704 ..
2705 }
2706 ));
2707 assert!(matches!(ir[1], IrEvent::Text { start: 1, end: 4 }));
2708 assert!(matches!(
2709 ir[2],
2710 IrEvent::CloseBracket {
2711 pos: 4,
2712 matched: false
2713 }
2714 ));
2715 }
2716
2717 #[test]
2718 fn scan_records_image_bracket() {
2719 let opts = cm_opts();
2720 let ir = build_ir("![alt]", 0, 6, &opts);
2721 assert!(matches!(
2722 ir[0],
2723 IrEvent::OpenBracket {
2724 start: 0,
2725 end: 2,
2726 is_image: true,
2727 ..
2728 }
2729 ));
2730 }
2731
2732 #[test]
2733 fn scan_handles_code_span_opacity() {
2734 let opts = cm_opts();
2735 let ir = build_ir("a `*x*` b", 0, 9, &opts);
2736 // Code span `*x*` should be a Construct, NOT delim runs.
2737 let has_delim_run = ir.iter().any(|e| matches!(e, IrEvent::DelimRun { .. }));
2738 assert!(
2739 !has_delim_run,
2740 "code span content should not produce delim runs"
2741 );
2742 assert!(ir.iter().any(|e| matches!(
2743 e,
2744 IrEvent::Construct {
2745 kind: ConstructKind::CodeSpan,
2746 ..
2747 }
2748 )));
2749 }
2750
/// After emphasis processing of `*foo*`, the delimiter run starting at
/// byte 0 carries exactly one match, recorded as an opener of kind
/// `EmphasisKind::Emph`.
#[test]
fn process_emphasis_simple_pair() {
    let opts = cm_opts();
    let src = "*foo*";
    let mut ir = build_ir(src, 0, src.len(), &opts);
    process_emphasis(&mut ir, opts.dialect);

    // Pull the match list straight off the first delimiter run that
    // starts at byte 0; the test fails if no such run exists.
    let pairings = ir
        .iter()
        .find_map(|event| match event {
            IrEvent::DelimRun { start: 0, matches, .. } => Some(matches),
            _ => None,
        })
        .unwrap();

    assert_eq!(pairings.len(), 1);
    assert!(pairings[0].is_opener);
    assert_eq!(pairings[0].kind, EmphasisKind::Emph);
}
2767
/// `[foo](/url)` resolves as an inline link whose destination is `/url`,
/// with no refdef map supplied.
#[test]
fn brackets_resolve_inline_link() {
    let opts = cm_opts();
    let src = "[foo](/url)";
    let mut ir = build_ir(src, 0, src.len(), &opts);
    process_brackets(&mut ir, src, None, opts.dialect);

    // Grab the resolution slot of the opener at byte 0; the test fails if
    // there is no such opener.
    let resolution = ir
        .iter()
        .find_map(|event| match event {
            IrEvent::OpenBracket { start: 0, resolution, .. } => Some(resolution),
            _ => None,
        })
        .unwrap();

    let resolved = resolution.as_ref().expect("inline link resolved");
    // The resolution must be the inline form, and its destination must be
    // exactly the text between the parentheses.
    match &resolved.kind {
        LinkKind::Inline { dest, .. } => assert_eq!(dest, "/url"),
        _ => panic!("assertion failed: matches!(r.kind, LinkKind::Inline { .. })"),
    }
}
2785
/// With a refdef map containing `foo`, a bare `[foo]` resolves as a
/// shortcut reference.
#[test]
fn brackets_shortcut_resolves_only_with_refdef() {
    let opts = cm_opts();
    let text = "[foo]";
    let map = refdefs(["foo"]);
    let mut ir = build_ir(text, 0, text.len(), &opts);
    process_brackets(&mut ir, text, Some(&map), opts.dialect);

    // Resolution slot of the opener at byte 0; the test fails if there is
    // no such opener.
    let resolution = ir
        .iter()
        .find_map(|event| match event {
            IrEvent::OpenBracket { start: 0, resolution, .. } => Some(resolution),
            _ => None,
        })
        .unwrap();

    // The bracket must have resolved, and specifically as a shortcut.
    let resolved = resolution.as_ref().unwrap();
    assert!(matches!(resolved.kind, LinkKind::ShortcutReference));
}
2804
/// CMark example #523 mechanic: with no refdef map, `[bar* baz]` must NOT
/// resolve as a link. The brackets stay literal, which leaves the inner
/// `*` available to the outer emphasis scanner.
#[test]
fn brackets_shortcut_falls_through_without_refdef() {
    let opts = cm_opts();
    let text = "[bar* baz]";
    let mut ir = build_ir(text, 0, text.len(), &opts);
    process_brackets(&mut ir, text, None, opts.dialect);

    // Resolution slot of the opener at byte 0; the test fails if there is
    // no such opener.
    let resolution = ir
        .iter()
        .find_map(|event| match event {
            IrEvent::OpenBracket { start: 0, resolution, .. } => Some(resolution),
            _ => None,
        })
        .unwrap();

    assert!(resolution.is_none(), "no refdef → bracket stays literal");
}
2822
/// Spec #473: `*[bar*](/url)`. The link `[bar*](/url)` resolves, and the
/// leading `*` MUST NOT pair with the `*` inside the link text: emphasis
/// may not reach across a resolved link's bracket boundary.
#[test]
fn full_plans_emphasis_does_not_cross_resolved_link_boundary() {
    let opts = cm_opts();
    let text = "*[bar*](/url)";
    let plans = build_full_plans(text, 0, text.len(), &opts);

    // Byte 0 is the leading `*`. There is no closer outside the link, and
    // the `*` at byte 5 sits inside the resolved link's text range, so the
    // leading `*` must end up literal (or have no entry at all).
    let leading_star = plans.emphasis.lookup(0);
    assert!(
        matches!(leading_star, Some(DelimChar::Literal) | None),
        "outer `*` at byte 0 must not pair across link boundary, got {:?}",
        leading_star
    );

    // Byte 1 is the `[` of `[bar*](/url)`, which must be a resolved opener.
    let link_opener = plans.brackets.lookup(1);
    assert!(
        matches!(link_opener, Some(BracketDispo::Open { .. })),
        "link [bar*](/url) must resolve at byte 1"
    );
}
2846
/// Spec #533: `[foo *bar [baz][ref]*][ref]` with `[ref]: /uri`.
///
/// The inner `[baz][ref]` resolves as a link. The §6.3 link-in-link rule
/// then deactivates the outer `[foo ...][ref]`, which falls through to
/// literal brackets, while the trailing `[ref]` still resolves on its own.
/// The emphasis `*bar [baz][ref]*` wraps the inner link.
#[test]
fn full_plans_link_in_link_suppression_for_reference_links() {
    let text = "[foo *bar [baz][ref]*][ref]";

    // Build the options once and register `ref` as the only known refdef
    // label; no intermediate copy of the options is needed.
    let mut opts = cm_opts();
    let labels: HashSet<String> = ["ref".to_string()].into_iter().collect();
    opts.refdef_labels = Some(std::sync::Arc::new(labels));

    let plans = build_full_plans(text, 0, text.len(), &opts);

    // Inner `[baz][ref]` opener is at byte 10 — must resolve.
    let inner_opener = plans.brackets.lookup(10);
    assert!(
        matches!(inner_opener, Some(BracketDispo::Open { .. })),
        "inner [baz][ref] must resolve at byte 10, got {:?}",
        inner_opener
    );

    // Outer `[foo ...][ref]` opener is at byte 0 — must NOT resolve
    // (link-in-link suppression).
    let outer_opener = plans.brackets.lookup(0);
    assert!(
        matches!(outer_opener, Some(BracketDispo::Literal) | None),
        "outer [foo ...][ref] must fall through to literal at byte 0, got {:?}",
        outer_opener
    );

    // Trailing `[ref]` after the outer `]` is at byte 22 — it's a
    // standalone shortcut reference and must resolve.
    let trailing_opener = plans.brackets.lookup(22);
    assert!(
        matches!(trailing_opener, Some(BracketDispo::Open { .. })),
        "trailing [ref] must resolve at byte 22, got {:?}",
        trailing_opener
    );

    // The `*` at byte 5 must be recorded as a paired emphasis opener; its
    // partner is the `*` at byte 20. Only the opener side is inspected
    // here.
    let emphasis_opener = plans.emphasis.lookup(5);
    assert!(
        matches!(emphasis_opener, Some(DelimChar::Open { .. })),
        "emphasis opener at byte 5 must pair, got {:?}",
        emphasis_opener
    );
}
2889}