panache_parser/parser/inlines/inline_ir.rs
1//! Inline IR for both CommonMark and Pandoc dialects.
2//!
3//! The inline parsing pipeline runs in three passes over an intermediate
4//! representation (IR):
5//!
6//! 1. **Scan** ([`build_ir`]): walk the source bytes once, producing a flat
7//! [`Vec<IrEvent>`]. Opaque higher-precedence constructs (escapes, code
8//! spans, autolinks, raw HTML, plus Pandoc math / native spans / inline
9//! footnotes / footnote references / citations / bracketed spans) are
10//! skipped past as a single [`IrEvent::Construct`] event whose source
11//! range is preserved for losslessness. Delimiter runs (`*`/`_`),
12//! bracket markers (`[`, `![`, `]`), soft line breaks, and plain text
13//! spans become distinct events.
14//!
15//! 2. **Process brackets** ([`process_brackets`]) — CommonMark §6.3: the
16//! bracket-stack algorithm walks `]` markers left-to-right. For each
17//! `]`, the algorithm finds the nearest active opener and tries to
18//! resolve the pair as a link or image: inline `[text](dest)`, full
19//! reference `[text][label]`, collapsed `[text][]`, or shortcut
20//! `[text]`. Under CommonMark, reference forms are validated against
21//! the document refdef map and a successful match deactivates all
22//! earlier active openers (§6.3 "links may not contain other links").
23//! Under Pandoc, reference forms resolve shape-only (any non-empty
24//! label) and the deactivation pass is skipped; outer-wins nested-link
25//! semantics are enforced by the emission walk's `suppress_inner_links`
26//! flag instead.
27//!
28//! 3. **Process emphasis** ([`process_emphasis_in_range`]): the classic
29//! delimiter-stack algorithm runs over the [`IrEvent::DelimRun`]
30//! events, pairing openers with closers and recording matches on the
//! runs. The pass runs scoped to each resolved bracket pair first
//! (innermost first), then once more as a top-level pass over the
//! residual events. Each match
33//! consumes 1 or 2 inner-edge bytes from each side; leftover bytes
34//! fall through to literal text. Dialect gates (Pandoc flanking rules,
35//! mod-3 rejection, asymmetric (1,2)/(2,1) rejection, opener-count >= 4
36//! rejection, triple-emph nesting flip, cascade-then-rerun) branch on
37//! the `dialect` parameter.
38//!
39//! The emission walk in [`super::core::parse_inline_range_impl`] consumes
40//! three byte-keyed plans built by [`build_full_plans`]: an
41//! [`EmphasisPlan`] for delim-run dispositions, a [`BracketPlan`] for
42//! resolved link/image bracket pairs, and a [`ConstructPlan`] for
43//! standalone Pandoc constructs (inline footnotes, native spans, footnote
44//! references, citations, bracketed spans). Matched delim runs become
45//! `EMPHASIS` / `STRONG` nodes; matched bracket pairs become `LINK` /
46//! `IMAGE` nodes via the dispatcher's `try_parse_*` recognizers (called
47//! to *parse* a matched range, not to *resolve* it). Unmatched delims and
48//! brackets fall through to plain text.
49
50use crate::options::ParserOptions;
51use crate::parser::inlines::refdef_map::{RefdefMap, normalize_label};
52use std::collections::{BTreeMap, HashSet};
53
/// Kind of emphasis a matched delimiter pair produces.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmphasisKind {
    /// Regular emphasis (the kind recorded for 1-byte markers; see
    /// [`DelimMatch::kind`]).
    Emph,
    /// Strong emphasis (the kind recorded for 2-byte markers; see
    /// [`DelimMatch::kind`]).
    Strong,
}
59
/// Disposition of a single delimiter byte after emphasis resolution.
///
/// Values are stored per position in an [`EmphasisPlan`].
#[derive(Debug, Clone, Copy)]
pub enum DelimChar {
    /// Start of an opening marker. The marker spans `len` bytes from this
    /// position; the matching closer starts at `partner` and spans
    /// `partner_len` bytes.
    Open {
        /// Length in bytes of the opening marker.
        len: u8,
        /// Position at which the matching closing marker starts.
        partner: usize,
        /// Length in bytes of the matching closing marker.
        partner_len: u8,
        /// Whether the pair forms emphasis or strong emphasis.
        kind: EmphasisKind,
    },
    /// Start of a closing marker. This variant carries no data of its
    /// own: the closer's position and length are recorded on the matching
    /// `Open` entry (`partner` / `partner_len`). Emission jumps past close
    /// markers via that `Open` entry, so this variant is only consulted
    /// defensively.
    Close,
    /// Unmatched delimiter byte; emit as literal text.
    Literal,
}
79
/// Byte-keyed disposition map for `*` / `_` delimiter chars produced by
/// the IR's emphasis pass and consumed by the inline emission walk.
///
/// The derived `Default` yields a plan with no entries.
#[derive(Debug, Default, Clone)]
pub struct EmphasisPlan {
    /// Disposition of each recorded delimiter byte, keyed by its position.
    by_pos: BTreeMap<usize, DelimChar>,
}
86
87impl EmphasisPlan {
88 pub fn lookup(&self, pos: usize) -> Option<DelimChar> {
89 self.by_pos.get(&pos).copied()
90 }
91
92 pub fn is_empty(&self) -> bool {
93 self.by_pos.is_empty()
94 }
95
96 /// Construct an `EmphasisPlan` from a byte-keyed disposition map.
97 pub fn from_dispositions(by_pos: BTreeMap<usize, DelimChar>) -> Self {
98 Self { by_pos }
99 }
100}
101
102use super::bracketed_spans::try_parse_bracketed_span;
103use super::citations::{try_parse_bare_citation, try_parse_bracketed_citation};
104use super::code_spans::try_parse_code_span;
105use super::escapes::{EscapeType, try_parse_escape};
106use super::inline_footnotes::{try_parse_footnote_reference, try_parse_inline_footnote};
107use super::inline_html::try_parse_inline_html;
108use super::links::{
109 LinkScanContext, try_parse_autolink, try_parse_inline_image, try_parse_inline_link,
110 try_parse_reference_image, try_parse_reference_link,
111};
112use super::math::{
113 try_parse_display_math, try_parse_double_backslash_display_math,
114 try_parse_double_backslash_inline_math, try_parse_gfm_inline_math, try_parse_inline_math,
115 try_parse_single_backslash_display_math, try_parse_single_backslash_inline_math,
116};
117use super::native_spans::try_parse_native_span;
118
/// One event in the inline IR.
///
/// Events partition the source byte range covered by the IR exactly: their
/// `range()` values are contiguous and non-overlapping, so concatenating
/// them reproduces the original input. This is the losslessness invariant
/// the emission pass relies on.
#[derive(Debug, Clone)]
pub enum IrEvent {
    /// Plain text byte span. Emitted as a single `TEXT` token, possibly
    /// merged with adjacent literal-disposition delim/bracket bytes.
    Text { start: usize, end: usize },

    /// An opaque higher-precedence construct (escape, code span, autolink,
    /// raw HTML, or one of the Pandoc / wikilink constructs listed in
    /// [`ConstructKind`]). The emission pass re-parses these from the
    /// source byte range using the existing per-construct emitters; we
    /// don't store a pre-built `GreenNode` because
    /// `rowan::GreenNodeBuilder` doesn't support inserting subtrees
    /// directly. The byte range is what makes emission well-defined;
    /// `kind` records which construct the scan recognised.
    Construct {
        start: usize,
        end: usize,
        /// Which construct was recognised at `start`.
        kind: ConstructKind,
    },

    /// A `*` or `_` delimiter run. The `matches` vec is filled in by the
    /// emphasis pass ([`process_emphasis_in_range`]); before that pass it
    /// is empty.
    DelimRun {
        /// The delimiter byte the run is made of (`b'*'` or `b'_'`).
        ch: u8,
        start: usize,
        end: usize,
        /// Whether the scan's flanking computation allows this run to
        /// open emphasis.
        can_open: bool,
        /// Whether the scan's flanking computation allows this run to
        /// close emphasis.
        can_close: bool,
        /// Matched fragments produced by the emphasis pass, one
        /// [`DelimMatch`] per fragment. Empty until the pass runs;
        /// possibly multiple entries when a single run matches at
        /// multiple positions (e.g. a 4-run that closes 2+2 pairs).
        matches: Vec<DelimMatch>,
    },

    /// `[` or `![` bracket marker. Resolved by [`process_brackets`].
    OpenBracket {
        start: usize,
        /// `start + 1` for `[`, `start + 2` for `![`.
        end: usize,
        /// `true` for the `![` image opener, `false` for plain `[`.
        is_image: bool,
        /// True until a later resolution rule deactivates this opener.
        active: bool,
        /// Filled in when the matching `CloseBracket` resolves the pair
        /// to a link / image.
        resolution: Option<BracketResolution>,
        /// Pandoc-only: extents of an unresolved bracket-shape pattern
        /// (full reference / collapsed / shortcut whose label doesn't
        /// match a refdef). Mutually exclusive with `resolution:
        /// Some(...)`. When `Some`, emission wraps `[start, end)` in
        /// an `UNRESOLVED_REFERENCE` node so downstream tools can
        /// attach behavior to the bracket-shape pattern. Always
        /// `None` under `Dialect::CommonMark`.
        unresolved_ref: Option<UnresolvedRefShape>,
    },

    /// `]` bracket marker. Resolved by [`process_brackets`].
    CloseBracket {
        /// Position of the `]` byte; the event covers `pos..pos + 1`.
        pos: usize,
        /// True if this `]` was paired with an opener and the pair was
        /// turned into a link / image.
        matched: bool,
    },

    /// A soft line break (a `\n` or `\r\n` ending a paragraph-internal
    /// line). Includes the line-ending bytes verbatim.
    SoftBreak { start: usize, end: usize },

    /// A hard line break: either two or more trailing spaces followed by
    /// a line ending, or a backslash-escaped line ending. Includes any
    /// trailing-space bytes plus the line ending.
    HardBreak { start: usize, end: usize },
}
197
198impl IrEvent {
199 /// The source byte range this event covers.
200 pub fn range(&self) -> (usize, usize) {
201 match self {
202 IrEvent::Text { start, end }
203 | IrEvent::Construct { start, end, .. }
204 | IrEvent::DelimRun { start, end, .. }
205 | IrEvent::OpenBracket { start, end, .. }
206 | IrEvent::SoftBreak { start, end }
207 | IrEvent::HardBreak { start, end } => (*start, *end),
208 IrEvent::CloseBracket { pos, .. } => (*pos, *pos + 1),
209 }
210 }
211}
212
/// Categorical tag for a [`IrEvent::Construct`] event so emission knows
/// which parser to call to rebuild the CST subtree.
///
/// The tag is assigned by the scan pass when it recognises the construct;
/// the construct's bytes are carried by the event's `start` / `end`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConstructKind {
    /// `\X` literal-character escape (CommonMark §2.4). The scan also
    /// uses this tag for the non-breaking-space escape type.
    Escape,
    /// `` `code` `` span (§6.1).
    CodeSpan,
    /// `<scheme://...>` or `<email@host>` (§6.5).
    Autolink,
    /// `<tag ...>` and friends (§6.6).
    InlineHtml,
    /// Pandoc opaque construct that doesn't have a dedicated kind yet
    /// (currently: math spans). Pre-recognised in `build_ir` under
    /// `Dialect::Pandoc` solely so the emphasis pass treats the entire
    /// construct as opaque and delim runs inside don't cross its
    /// boundary. Emission re-parses the construct via the dispatcher's
    /// existing `try_parse_*` chain.
    PandocOpaque,
    /// Pandoc inline footnote `^[note text]`. Recognised in `build_ir`
    /// under `Dialect::Pandoc` and consumed by the emission walk via
    /// the IR's `ConstructPlan`. The dispatcher's legacy `^[` branch
    /// is gated to CommonMark dialect only.
    InlineFootnote,
    /// Pandoc native span `<span ...>...</span>`. Recognised in
    /// `build_ir` under `Dialect::Pandoc` and consumed by the emission
    /// walk via the IR's `ConstructPlan`. The dispatcher's legacy
    /// `<span>` branch is gated to CommonMark dialect only.
    NativeSpan,
    /// Pandoc footnote reference `[^id]`. Recognised in `build_ir`
    /// under `Dialect::Pandoc` and consumed by the emission walk via
    /// the IR's `ConstructPlan`. The dispatcher's legacy `[^id]`
    /// branch is gated to CommonMark dialect only.
    FootnoteReference,
    /// Pandoc bracketed citation `[@key]`, `[see @key, p. 1]`,
    /// `[@a; @b]`. Recognised in `build_ir` under `Dialect::Pandoc`
    /// and consumed by the emission walk via the IR's `ConstructPlan`.
    /// The dispatcher's legacy `[@cite]` branch is gated to CommonMark
    /// dialect only.
    BracketedCitation,
    /// Pandoc bare citation `@key` or `-@key` (author-in-text /
    /// suppress-author). Recognised in `build_ir` under
    /// `Dialect::Pandoc` and consumed by the emission walk via the
    /// IR's `ConstructPlan`. The dispatcher's legacy `@` and `-@`
    /// branches are gated to CommonMark dialect only.
    BareCitation,
    /// Pandoc bracketed span `[content]{attrs}`. Recognised in
    /// `build_ir` under `Dialect::Pandoc` and consumed by the emission
    /// walk via the IR's `ConstructPlan`. The dispatcher's legacy
    /// `[text]{attrs}` branch is gated to CommonMark dialect only.
    BracketedSpan,
    /// Pandoc wikilink `[[url]]` / `[[url|title]]` / `![[url]]` /
    /// `![[url|title]]`. Recognised in `build_ir` when either
    /// `wikilinks_title_after_pipe` or `wikilinks_title_before_pipe` is
    /// enabled. Dialect-agnostic (pandoc accepts the extension on both
    /// `markdown+` and `commonmark+`). The emission walk dispatches via
    /// the IR's `ConstructPlan`; the `is_image` variant is recovered by
    /// peeking the leading byte of the source range.
    WikiLink,
}
273
/// One matched fragment within a [`IrEvent::DelimRun`].
///
/// A run accumulates one `DelimMatch` per fragment it contributes to an
/// emphasis pair; the partner fragment is addressed by event index plus
/// an offset inside that partner run.
// NOTE(review): both offsets are stored as `u8`; confirm how the emphasis
// pass handles delimiter runs longer than 255 bytes.
#[derive(Debug, Clone, Copy)]
pub struct DelimMatch {
    /// Byte offset of this fragment relative to the run's `start`.
    pub offset_in_run: u8,
    /// Number of bytes in this fragment (1 or 2).
    pub len: u8,
    /// Whether this fragment is the opener (`true`) or closer of the pair.
    pub is_opener: bool,
    /// IR event index of the partner run.
    pub partner_event: u32,
    /// Byte offset within the partner run of the partner fragment.
    pub partner_offset: u8,
    /// Emphasis kind (Emph for `len == 1`, Strong for `len == 2`).
    pub kind: EmphasisKind,
}
290
/// Pandoc-only: extents of an unresolved bracket-shape reference
/// pattern. Recorded on `IrEvent::OpenBracket.unresolved_ref` when the
/// no-resolution fall-through fires under `Dialect::Pandoc`.
///
/// The pattern starts at the owning opener's `start`; this struct only
/// carries the positions that cannot be read off the opener itself.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct UnresolvedRefShape {
    /// IR event index of the matching `CloseBracket`. Used by the
    /// scoped-emphasis pass to treat the wrapper as a tree boundary.
    pub close_event: u32,
    /// One past the end of the inner text (the byte position of the
    /// outer `]`). Combined with the opener's `end` field, this is the
    /// inner text range that goes through normal inline parsing.
    pub text_end: usize,
    /// One past the end of the full bracket-shape pattern. For
    /// shortcut form `[text]`: `close_pos + 1`. For collapsed
    /// `[text][]`: `close_pos + 3`. For full `[text][label]`: the byte
    /// after the closing `]` of `[label]`.
    pub end: usize,
}
309
/// Successful bracket resolution: the `[`...`]` pair is a link or image.
///
/// Stored on the opener (`IrEvent::OpenBracket.resolution`) once the
/// matching `CloseBracket` resolves the pair.
#[derive(Debug, Clone)]
pub struct BracketResolution {
    /// IR event index of the matching `CloseBracket`.
    pub close_event: u32,
    /// Source range of the link text (between `[`/`![` and `]`):
    /// `text_start` is where the text begins.
    pub text_start: usize,
    /// End of the link-text source range that begins at `text_start`.
    pub text_end: usize,
    /// Source range of the link suffix (`(...)`, `[label]`, `[]`, or
    /// empty for shortcut). When `kind == ShortcutReference`,
    /// `suffix_start == suffix_end == close_pos + 1`.
    pub suffix_start: usize,
    /// End of the link-suffix source range that begins at `suffix_start`.
    pub suffix_end: usize,
    /// Which link / image form the pair resolved to.
    pub kind: LinkKind,
}
325
/// What kind of link/image we resolved a bracket pair to.
///
/// Each variant corresponds to one suffix shape following the closing
/// `]` of the link text.
#[derive(Debug, Clone)]
pub enum LinkKind {
    /// `[text](dest)` or `[text](dest "title")`.
    Inline {
        /// The link destination from the `(...)` suffix.
        dest: String,
        /// The optional title from the `(...)` suffix; `None` when the
        /// suffix has no title.
        title: Option<String>,
    },
    /// `[text][label]` — explicit reference.
    FullReference {
        /// The reference label from the `[label]` suffix.
        label: String,
    },
    /// `[text][]` — collapsed reference. Label is the link text.
    CollapsedReference,
    /// `[text]` — shortcut reference. Label is the link text.
    ShortcutReference,
}
338
339// ============================================================================
340// Pass 1: Scan
341// ============================================================================
342
343/// Scan `text[start..end]` once, producing a flat IR of events.
344///
345/// The scan is forward-only and never backtracks: each iteration either
346/// consumes a known construct (escape, code span, autolink, raw HTML),
347/// records a delim run / bracket marker / line break, or steps past a
348/// single UTF-8 boundary as plain text. Adjacent text bytes are coalesced
349/// into a single [`IrEvent::Text`] event by the run-flush step.
350pub fn build_ir(text: &str, start: usize, end: usize, config: &ParserOptions) -> Vec<IrEvent> {
351 let mut events = Vec::new();
352 build_ir_into(text, start, end, config, &mut events);
353 events
354}
355
356/// Like [`build_ir`] but writes into a caller-provided `Vec<IrEvent>`,
357/// clearing it first. Used by [`build_full_plans`] to amortise the
358/// per-call allocation through a thread-local scratch pool.
359pub(super) fn build_ir_into(
360 text: &str,
361 start: usize,
362 end: usize,
363 config: &ParserOptions,
364 events: &mut Vec<IrEvent>,
365) {
366 events.clear();
367 let bytes = text.as_bytes();
368 let exts = &config.extensions;
369 let is_commonmark = config.dialect == crate::options::Dialect::CommonMark;
370
371 let mut pos = start;
372 let mut text_run_start = start;
373 // Pandoc-only: extent of the current bracket-shape link/image's
374 // opaque range. While `pos < pandoc_bracket_extent`, autolinks /
375 // raw HTML / native spans are NOT recognised — pandoc-native
376 // treats `[link text]` as opaque to those constructs (CommonMark
377 // spec example #526 / #538). The lookahead at `[`/`![` sets this
378 // when a bracket-shape forms a valid link/image; once `pos`
379 // passes the extent, normal scanning resumes. CommonMark
380 // dialect's link-text-vs-autolink ordering is handled by the
381 // dispatcher's `try_parse_inline_link` rejecting outer matches
382 // when the link text contains a valid autolink (a different
383 // mechanism, see `LinkScanContext.skip_autolinks`).
384 let mut pandoc_bracket_extent: usize = 0;
385
386 // Pre-computed byte mask: `mask[b]` is `true` iff byte `b` could
387 // start any IR-recognised construct under the current dialect /
388 // extensions. Used to bulk-skip plain bytes between structural
389 // bytes — the per-byte branch chain below only runs at positions
390 // where a construct is actually possible. Non-ASCII bytes
391 // (>= 0x80) are never structural and are skipped together with
392 // ASCII plain text.
393 let mask = build_ir_byte_mask(config);
394
395 macro_rules! flush_text {
396 () => {
397 if pos > text_run_start {
398 events.push(IrEvent::Text {
399 start: text_run_start,
400 end: pos,
401 });
402 }
403 };
404 }
405
406 while pos < end {
407 // Fast-skip plain bytes. `text_run_start` is preserved across
408 // the skip so the next structural-event flush picks them up.
409 while pos < end && !mask[bytes[pos] as usize] {
410 pos += 1;
411 }
412 if pos >= end {
413 break;
414 }
415 let b = bytes[pos];
416
417 // Pandoc-only: at `[` or `![`, look ahead to see if this
418 // bracket-shape forms a valid link/image. If so, suppress
419 // autolink / raw HTML / native span recognition until `pos`
420 // passes the bracket-shape's end. Skipped if we're already
421 // inside an enclosing bracket-shape's opaque range.
422 if !is_commonmark
423 && pos >= pandoc_bracket_extent
424 && (b == b'[' || (b == b'!' && pos + 1 < end && bytes[pos + 1] == b'['))
425 && let Some(len) = try_pandoc_bracket_link_extent(text, pos, end, config)
426 {
427 pandoc_bracket_extent = pos + len;
428 }
429 let in_pandoc_bracket = !is_commonmark && pos < pandoc_bracket_extent;
430
431 // Backslash escape (§2.4) — including `\\\n` hard line break.
432 if b == b'\\'
433 && let Some((len, _ch, escape_type)) = try_parse_escape(&text[pos..])
434 && pos + len <= end
435 {
436 let enabled = match escape_type {
437 EscapeType::Literal => is_commonmark || exts.all_symbols_escapable,
438 EscapeType::HardLineBreak => exts.escaped_line_breaks,
439 EscapeType::NonbreakingSpace => exts.all_symbols_escapable,
440 };
441 if enabled {
442 flush_text!();
443 let kind = match escape_type {
444 EscapeType::HardLineBreak => {
445 events.push(IrEvent::HardBreak {
446 start: pos,
447 end: pos + len,
448 });
449 pos += len;
450 text_run_start = pos;
451 continue;
452 }
453 EscapeType::Literal | EscapeType::NonbreakingSpace => ConstructKind::Escape,
454 };
455 events.push(IrEvent::Construct {
456 start: pos,
457 end: pos + len,
458 kind,
459 });
460 pos += len;
461 text_run_start = pos;
462 continue;
463 }
464 }
465
466 // Code span (§6.1) — opaque to emphasis and brackets.
467 if b == b'`'
468 && let Some((len, _, _, _)) = try_parse_code_span(&text[pos..])
469 && pos + len <= end
470 {
471 flush_text!();
472 events.push(IrEvent::Construct {
473 start: pos,
474 end: pos + len,
475 kind: ConstructKind::CodeSpan,
476 });
477 pos += len;
478 text_run_start = pos;
479 continue;
480 }
481
482 // Pandoc-only: math spans are opaque to emphasis. The legacy
483 // `parse_until_closer_with_nested_*` skip-list includes inline
484 // math; without recognising it here, delim runs inside `$math$`
485 // would be picked up by the emphasis pass and break losslessness
486 // (the dispatcher's math parser would later re-claim the bytes,
487 // duplicating content).
488 if !is_commonmark && let Some(len) = try_pandoc_math_opaque(text, pos, end, config) {
489 flush_text!();
490 events.push(IrEvent::Construct {
491 start: pos,
492 end: pos + len,
493 kind: ConstructKind::PandocOpaque,
494 });
495 pos += len;
496 text_run_start = pos;
497 continue;
498 }
499
500 // Pandoc-only: native span `<span ...>...</span>`. Must come
501 // before the generic autolink/raw-html branches so the open tag
502 // doesn't get claimed as inline HTML. Span content is opaque to
503 // the emphasis pass; emission consumes the event via the IR's
504 // `ConstructPlan`. Suppressed inside Pandoc bracket-shape
505 // link/image text.
506 if !is_commonmark
507 && !in_pandoc_bracket
508 && b == b'<'
509 && exts.native_spans
510 && let Some((len, _, _)) = try_parse_native_span(&text[pos..])
511 && pos + len <= end
512 {
513 flush_text!();
514 events.push(IrEvent::Construct {
515 start: pos,
516 end: pos + len,
517 kind: ConstructKind::NativeSpan,
518 });
519 pos += len;
520 text_run_start = pos;
521 continue;
522 }
523
524 // Autolink (§6.5) before raw HTML — autolinks are the more
525 // specific shape inside `<...>`. Both are suppressed inside
526 // Pandoc bracket-shape link/image text (pandoc-native treats
527 // link text as opaque to autolinks and raw HTML).
528 if b == b'<' && !in_pandoc_bracket {
529 if exts.autolinks
530 && let Some((len, _)) = try_parse_autolink(&text[pos..], is_commonmark)
531 && pos + len <= end
532 {
533 flush_text!();
534 events.push(IrEvent::Construct {
535 start: pos,
536 end: pos + len,
537 kind: ConstructKind::Autolink,
538 });
539 pos += len;
540 text_run_start = pos;
541 continue;
542 }
543 if exts.raw_html
544 && let Some(len) = try_parse_inline_html(&text[pos..], config.dialect)
545 && pos + len <= end
546 {
547 flush_text!();
548 events.push(IrEvent::Construct {
549 start: pos,
550 end: pos + len,
551 kind: ConstructKind::InlineHtml,
552 });
553 pos += len;
554 text_run_start = pos;
555 continue;
556 }
557 }
558
559 // Pandoc-only: inline footnote `^[note]`. Recognized at scan
560 // time so the emphasis pass treats it as opaque (delim runs
561 // inside the footnote can't pair with delim runs outside).
562 if !is_commonmark
563 && b == b'^'
564 && exts.inline_footnotes
565 && let Some((len, _)) = try_parse_inline_footnote(&text[pos..])
566 && pos + len <= end
567 {
568 flush_text!();
569 events.push(IrEvent::Construct {
570 start: pos,
571 end: pos + len,
572 kind: ConstructKind::InlineFootnote,
573 });
574 pos += len;
575 text_run_start = pos;
576 continue;
577 }
578
579 // Pandoc-only: footnote reference `[^id]`. Recognised at scan
580 // time so the emphasis pass treats it as opaque (delim runs
581 // inside the label can't pair with delim runs outside) and the
582 // emission walk dispatches it directly via the IR's
583 // `ConstructPlan`. Must come before the generic bracket-opaque
584 // scan so the dedicated kind wins.
585 if !is_commonmark
586 && b == b'['
587 && pos + 1 < end
588 && bytes[pos + 1] == b'^'
589 && exts.footnotes
590 && let Some((len, _)) = try_parse_footnote_reference(&text[pos..])
591 && pos + len <= end
592 {
593 flush_text!();
594 events.push(IrEvent::Construct {
595 start: pos,
596 end: pos + len,
597 kind: ConstructKind::FootnoteReference,
598 });
599 pos += len;
600 text_run_start = pos;
601 continue;
602 }
603
604 // Pandoc-only: bracketed citation `[@cite]`. Recognised at
605 // scan time so the emphasis pass treats it as opaque (delim
606 // runs inside the citation can't pair with delim runs outside)
607 // and the emission walk dispatches it directly via the IR's
608 // `ConstructPlan`. Must come before the generic bracket-opaque
609 // scan so the dedicated kind wins.
610 if !is_commonmark
611 && b == b'['
612 && exts.citations
613 && let Some((len, _)) = try_parse_bracketed_citation(&text[pos..])
614 && pos + len <= end
615 {
616 flush_text!();
617 events.push(IrEvent::Construct {
618 start: pos,
619 end: pos + len,
620 kind: ConstructKind::BracketedCitation,
621 });
622 pos += len;
623 text_run_start = pos;
624 continue;
625 }
626
627 // Pandoc-only: bare citation `@key` or `-@key`. Recognised at
628 // scan time so the emission walk dispatches it directly via
629 // the IR's `ConstructPlan`. Bare citations don't contain
630 // emphasis-eligible content, so opacity is moot here — IR
631 // participation is only for dispatch consolidation.
632 if !is_commonmark
633 && (b == b'@' || (b == b'-' && pos + 1 < end && bytes[pos + 1] == b'@'))
634 && (exts.citations || exts.quarto_crossrefs)
635 && let Some((len, _, _)) = try_parse_bare_citation(&text[pos..])
636 && pos + len <= end
637 {
638 flush_text!();
639 events.push(IrEvent::Construct {
640 start: pos,
641 end: pos + len,
642 kind: ConstructKind::BareCitation,
643 });
644 pos += len;
645 text_run_start = pos;
646 continue;
647 }
648
649 // Pandoc-only: bracketed span `[content]{attrs}`. Recognised
650 // at scan time so the emphasis pass treats it as opaque (delim
651 // runs inside the span content can't pair with delim runs
652 // outside) and the emission walk dispatches it directly via
653 // the IR's `ConstructPlan`. Must come before the generic
654 // bracket-opaque scan so the dedicated kind wins.
655 // `try_parse_bracketed_span` requires `]` to be immediately
656 // followed by `{`, so this never shadows inline links
657 // (`[text](url)`) or reference links (`[label][refdef]`) —
658 // those don't have the `{attrs}` suffix.
659 if !is_commonmark
660 && b == b'['
661 && exts.bracketed_spans
662 && let Some((len, _, _)) = try_parse_bracketed_span(&text[pos..])
663 && pos + len <= end
664 {
665 flush_text!();
666 events.push(IrEvent::Construct {
667 start: pos,
668 end: pos + len,
669 kind: ConstructKind::BracketedSpan,
670 });
671 pos += len;
672 text_run_start = pos;
673 continue;
674 }
675
676 // Wikilinks `[[url]]`, `[[url|title]]`, `![[url]]`,
677 // `![[url|title]]`. Recognised on either pipe-order extension
678 // and on both dialects (pandoc accepts the extension under both
679 // `markdown+` and `commonmark+`). Must precede the `` form, or `reference_links` for the
701 // `![alt][label]` reference-image form (e.g. MultiMarkdown
702 // disables `inline_images` but uses reference images).
703 if b == b'!'
704 && pos + 1 < end
705 && bytes[pos + 1] == b'['
706 && (exts.inline_images || exts.reference_links)
707 {
708 flush_text!();
709 events.push(IrEvent::OpenBracket {
710 start: pos,
711 end: pos + 2,
712 is_image: true,
713 active: true,
714 resolution: None,
715 unresolved_ref: None,
716 });
717 pos += 2;
718 text_run_start = pos;
719 continue;
720 }
721
722 // `[` opens a link bracket. Recognised whenever any
723 // link-producing extension is on — `inline_links` for
724 // `[text](url)`, or `reference_links` for `[text][label]` /
725 // `[text]` shortcut form.
726 if b == b'[' && (exts.inline_links || exts.reference_links) {
727 flush_text!();
728 events.push(IrEvent::OpenBracket {
729 start: pos,
730 end: pos + 1,
731 is_image: false,
732 active: true,
733 resolution: None,
734 unresolved_ref: None,
735 });
736 pos += 1;
737 text_run_start = pos;
738 continue;
739 }
740
741 // `]` closes a link/image bracket.
742 if b == b']' {
743 flush_text!();
744 events.push(IrEvent::CloseBracket {
745 pos,
746 matched: false,
747 });
748 pos += 1;
749 text_run_start = pos;
750 continue;
751 }
752
753 // `*` or `_` delimiter run.
754 if b == b'*' || b == b'_' {
755 flush_text!();
756 let mut run_end = pos;
757 while run_end < end && bytes[run_end] == b {
758 run_end += 1;
759 }
760 let count = run_end - pos;
761 let (can_open, can_close) = compute_flanking(text, pos, count, b, config.dialect);
762 events.push(IrEvent::DelimRun {
763 ch: b,
764 start: pos,
765 end: run_end,
766 can_open,
767 can_close,
768 matches: Vec::new(),
769 });
770 pos = run_end;
771 text_run_start = pos;
772 continue;
773 }
774
775 // Hard line break: 2+ trailing spaces before newline. We detect
776 // this when we're sitting on a `\n` (or `\r\n`) and the preceding
777 // bytes within the current text run are spaces.
778 if b == b'\n' || (b == b'\r' && pos + 1 < end && bytes[pos + 1] == b'\n') {
779 // Count trailing spaces in the text accumulated so far.
780 let nl_len = if b == b'\r' { 2 } else { 1 };
781 let mut trailing_spaces = 0;
782 let mut s = pos;
783 while s > text_run_start && bytes[s - 1] == b' ' {
784 trailing_spaces += 1;
785 s -= 1;
786 }
787 if trailing_spaces >= 2 {
788 // Flush text *before* the trailing spaces.
789 if s > text_run_start {
790 events.push(IrEvent::Text {
791 start: text_run_start,
792 end: s,
793 });
794 }
795 events.push(IrEvent::HardBreak {
796 start: s,
797 end: pos + nl_len,
798 });
799 pos += nl_len;
800 text_run_start = pos;
801 continue;
802 }
803
804 // Soft line break: flush preceding text, emit the line ending
805 // as its own event so the emitter can render `NEWLINE` tokens
806 // verbatim.
807 flush_text!();
808 events.push(IrEvent::SoftBreak {
809 start: pos,
810 end: pos + nl_len,
811 });
812 pos += nl_len;
813 text_run_start = pos;
814 continue;
815 }
816
817 // Plain byte — advance one UTF-8 char.
818 let ch_len = text[pos..]
819 .chars()
820 .next()
821 .map_or(1, std::primitive::char::len_utf8);
822 pos += ch_len.max(1);
823 }
824
825 flush_text!();
826}
827
828/// Build a 256-entry mask: `mask[b]` is `true` iff byte `b` could start
829/// any IR-recognised construct under the current dialect / extensions.
830///
831/// This is the build-IR-specific superset of "is this byte interesting".
832/// Plain bytes between structural bytes are bulk-skipped via this mask
833/// in the [`build_ir`] hot loop; missing a byte here is a correctness
834/// bug (we'd skip past a real construct), but having extras only costs
835/// us a wasted branch round-trip.
836fn build_ir_byte_mask(config: &ParserOptions) -> [bool; 256] {
837 let mut mask = [false; 256];
838 let exts = &config.extensions;
839 let is_commonmark = config.dialect == crate::options::Dialect::CommonMark;
840
841 // Always structural for IR scanning:
842 // `\n` / `\r` — soft / hard breaks
843 // `\\` — escape, hard line break, backslash math
844 // `` ` `` — code span (IR construct)
845 // `*` / `_` — emphasis delim runs (IR core)
846 mask[b'\n' as usize] = true;
847 mask[b'\r' as usize] = true;
848 mask[b'\\' as usize] = true;
849 mask[b'`' as usize] = true;
850 mask[b'*' as usize] = true;
851 mask[b'_' as usize] = true;
852
853 // Brackets: scanned whenever any bracket-shaped construct is
854 // reachable. `]` is structural unconditionally if `[` is — the IR
855 // emits a CloseBracket event regardless of which opener variant
856 // matches. `!` is gated on image-producing extensions; the leading
857 // `!` of `![alt]` is the only image entry point.
858 if exts.inline_links
859 || exts.reference_links
860 || exts.inline_images
861 || exts.bracketed_spans
862 || exts.footnotes
863 || exts.citations
864 {
865 mask[b'[' as usize] = true;
866 mask[b']' as usize] = true;
867 }
868 if exts.inline_images || exts.reference_links {
869 mask[b'!' as usize] = true;
870 }
871
872 // `<` covers autolinks, raw HTML, and Pandoc native spans.
873 if exts.autolinks || exts.raw_html || (!is_commonmark && exts.native_spans) {
874 mask[b'<' as usize] = true;
875 }
876
877 // `^` covers Pandoc inline footnotes (`^[...]` recognised in IR
878 // under Pandoc dialect). CM dialect inline footnotes go through
879 // the dispatcher, not the IR.
880 if !is_commonmark && exts.inline_footnotes {
881 mask[b'^' as usize] = true;
882 }
883
884 // `@` covers Pandoc bare citation `@key` and `[@cite]`. The leading
885 // `[` of `[@cite]` is already in the mask via the bracket gate;
886 // gating `@` here also covers the bare-citation form.
887 if !is_commonmark && (exts.citations || exts.quarto_crossrefs) {
888 mask[b'@' as usize] = true;
889 // `-` only matters as the first byte of `-@cite`. Tracking it
890 // here avoids missing the suppress-author bare citation form.
891 mask[b'-' as usize] = true;
892 }
893
894 // `$` covers Pandoc dollar / GFM math. CM doesn't recognise math
895 // in `build_ir`.
896 if !is_commonmark
897 && (exts.tex_math_dollars
898 || exts.tex_math_gfm
899 || exts.tex_math_single_backslash
900 || exts.tex_math_double_backslash)
901 {
902 mask[b'$' as usize] = true;
903 }
904
905 mask
906}
907
908// ============================================================================
909// Flanking (CommonMark §6.2)
910// ============================================================================
911
912fn compute_flanking(
913 text: &str,
914 pos: usize,
915 count: usize,
916 ch: u8,
917 dialect: crate::options::Dialect,
918) -> (bool, bool) {
919 if dialect == crate::options::Dialect::Pandoc {
920 // Pandoc-markdown's recursive-descent emphasis parser does NOT
921 // apply CommonMark §6.2 flanking rules. Instead it gates on:
922 // - opener: must not be followed by whitespace (Pandoc
923 // `try_parse_emphasis` line 247 in legacy core.rs).
924 // - closer: no flanking gate at all (Pandoc-markdown's
925 // `ender` parser only counts characters; see Markdown.hs
926 // in pandoc/src/Text/Pandoc/Readers/Markdown.hs).
927 // - underscore intraword hard rule: `_` adjacent to an
928 // alphanumeric on either side cannot open / close
929 // (Pandoc's `intraword_underscores` extension default).
930 let prev_char = (pos > 0).then(|| text[..pos].chars().last()).flatten();
931 let next_char = text.get(pos + count..).and_then(|s| s.chars().next());
932 let followed_by_ws = next_char.is_none_or(|c| c.is_whitespace());
933
934 let mut can_open = !followed_by_ws;
935 // Pandoc-markdown's `ender` (in pandoc/Readers/Markdown.hs)
936 // has no flanking restriction on closers — just a count match.
937 // Set can_close unconditionally; the per-pair match logic in
938 // `process_emphasis_in_range_filtered` constrains pairing via
939 // the equal-count rule.
940 let mut can_close = true;
941
942 if ch == b'_' {
943 let prev_is_alnum = prev_char.is_some_and(|c| c.is_alphanumeric());
944 let next_is_alnum = next_char.is_some_and(|c| c.is_alphanumeric());
945 if prev_is_alnum {
946 can_open = false;
947 }
948 if next_is_alnum {
949 can_close = false;
950 }
951 }
952
953 return (can_open, can_close);
954 }
955
956 // CommonMark §6.2 flanking.
957 let lf = is_left_flanking(text, pos, count);
958 let rf = is_right_flanking(text, pos, count);
959 if ch == b'*' {
960 (lf, rf)
961 } else {
962 let prev_char = (pos > 0).then(|| text[..pos].chars().last()).flatten();
963 let next_char = text.get(pos + count..).and_then(|s| s.chars().next());
964 let preceded_by_punct = prev_char.is_some_and(is_unicode_punct_or_symbol);
965 let followed_by_punct = next_char.is_some_and(is_unicode_punct_or_symbol);
966 let can_open = lf && (!rf || preceded_by_punct);
967 let can_close = rf && (!lf || followed_by_punct);
968 (can_open, can_close)
969 }
970}
971
972/// Pandoc-only: identify a math span starting at `pos` and return its
973/// byte length. Tries `$math$` and `$$display$$` (gated on
974/// `tex_math_dollars`), GFM `$math$` (gated on `tex_math_gfm`), and the
975/// `\(math\)` / `\[math\]` / `\\(math\\)` / `\\[math\\]` backslash
976/// forms (gated on `tex_math_single_backslash` / `_double_backslash`).
977/// Math content is opaque to emphasis: `$a * b$` must not produce an
978/// emphasis closer at the inner `*`.
979fn try_pandoc_math_opaque(
980 text: &str,
981 pos: usize,
982 end: usize,
983 config: &ParserOptions,
984) -> Option<usize> {
985 let bytes = text.as_bytes();
986 let exts = &config.extensions;
987 let b = bytes[pos];
988
989 if exts.tex_math_dollars && b == b'$' {
990 if let Some((len, _)) = try_parse_display_math(&text[pos..])
991 && pos + len <= end
992 {
993 return Some(len);
994 }
995 if let Some((len, _)) = try_parse_inline_math(&text[pos..])
996 && pos + len <= end
997 {
998 return Some(len);
999 }
1000 }
1001 if exts.tex_math_gfm
1002 && b == b'$'
1003 && let Some((len, _)) = try_parse_gfm_inline_math(&text[pos..])
1004 && pos + len <= end
1005 {
1006 return Some(len);
1007 }
1008 if exts.tex_math_double_backslash && b == b'\\' {
1009 if let Some((len, _)) = try_parse_double_backslash_display_math(&text[pos..])
1010 && pos + len <= end
1011 {
1012 return Some(len);
1013 }
1014 if let Some((len, _)) = try_parse_double_backslash_inline_math(&text[pos..])
1015 && pos + len <= end
1016 {
1017 return Some(len);
1018 }
1019 }
1020 if exts.tex_math_single_backslash && b == b'\\' {
1021 if let Some((len, _)) = try_parse_single_backslash_display_math(&text[pos..])
1022 && pos + len <= end
1023 {
1024 return Some(len);
1025 }
1026 if let Some((len, _)) = try_parse_single_backslash_inline_math(&text[pos..])
1027 && pos + len <= end
1028 {
1029 return Some(len);
1030 }
1031 }
1032 None
1033}
1034
1035/// Pandoc-only: identify a bracket-shaped opaque construct starting at
1036/// `pos` and return its byte length. Tries the dispatcher's precedence
1037/// order:
/// 1. `![alt](dest)` inline image
1039/// 2. `![alt][ref]` / `![alt]` reference image (shape-only opacity)
1040/// 3. `[^id]` footnote reference
1041/// 4. `[text](dest)` inline link
1042/// 5. `[text][ref]` / `[text]` reference link (shape-only opacity)
1043/// 6. `[@cite]` bracketed citation
1044/// 7. `[text]{attrs}` bracketed span
1045///
1046/// Returns `None` if the bytes at `pos` don't open any recognised Pandoc
1047/// bracket-shaped construct. In that case the scanner falls through to
1048/// the generic `OpenBracket`/`CloseBracket` emission and the dispatcher
1049/// emits the bracket bytes as literal text (or as plain emphasis if the
1050/// pattern matches an opener).
1051/// Lookahead helper: at a `[` or `![` byte under Pandoc dialect, return
1052/// the total byte length of the bracket-shape link/image if it forms a
1053/// valid one, else `None`. Used by `build_ir` to suppress autolink /
1054/// raw HTML / native span recognition inside Pandoc link text —
1055/// pandoc-native treats link text as opaque to those constructs
1056/// (CommonMark spec example #526 / #538 differs). Mirrors the
1057/// dispatcher's `try_parse_*` precedence so the lookahead, the IR's
1058/// `process_brackets` resolution, and the dispatcher's emission agree
1059/// on the bracket-shape's byte boundaries.
1060fn try_pandoc_bracket_link_extent(
1061 text: &str,
1062 pos: usize,
1063 end: usize,
1064 config: &ParserOptions,
1065) -> Option<usize> {
1066 let bytes = text.as_bytes();
1067 let exts = &config.extensions;
1068 let ctx = LinkScanContext::from_options(config);
1069 let allow_shortcut = exts.shortcut_reference_links;
1070
1071 // `![...]` images.
1072 if bytes[pos] == b'!' {
1073 if pos + 1 >= end || bytes[pos + 1] != b'[' {
1074 return None;
1075 }
1076 if exts.inline_images
1077 && let Some((len, _, _, _)) = try_parse_inline_image(&text[pos..], ctx)
1078 && pos + len <= end
1079 {
1080 return Some(len);
1081 }
1082 if exts.reference_links
1083 && let Some((len, _, _, _, _)) =
1084 try_parse_reference_image(&text[pos..], allow_shortcut, exts.spaced_reference_links)
1085 && pos + len <= end
1086 {
1087 return Some(len);
1088 }
1089 return None;
1090 }
1091
1092 // `[...]` openers — try in dispatcher order. Footnote refs
1093 // (`[^id]`), bracketed citations (`[@cite]`), and bracketed spans
1094 // (`[text]{attrs}`) are recognised by their own dedicated branches
1095 // in `build_ir` and don't need this lookahead.
1096 if exts.inline_links
1097 && let Some((len, _, _, _)) = try_parse_inline_link(&text[pos..], false, ctx)
1098 && pos + len <= end
1099 {
1100 return Some(len);
1101 }
1102 if exts.reference_links
1103 && let Some((len, _, _, _, _)) = try_parse_reference_link(
1104 &text[pos..],
1105 allow_shortcut,
1106 exts.inline_links,
1107 exts.spaced_reference_links,
1108 ctx,
1109 )
1110 && pos + len <= end
1111 {
1112 return Some(len);
1113 }
1114
1115 None
1116}
1117
/// Approximate the CommonMark "Unicode punctuation character" test
/// using only the standard library.
///
/// ASCII characters are classified with `char::is_ascii_punctuation`.
/// Any non-ASCII character counts as punctuation / symbol when it is
/// neither alphanumeric nor whitespace.
fn is_unicode_punct_or_symbol(c: char) -> bool {
    match c {
        ascii if ascii.is_ascii() => ascii.is_ascii_punctuation(),
        other => !(other.is_alphanumeric() || other.is_whitespace()),
    }
}
1125
1126fn is_left_flanking(text: &str, run_start: usize, run_len: usize) -> bool {
1127 let after = run_start + run_len;
1128 let next_char = text.get(after..).and_then(|s| s.chars().next());
1129 let prev_char = (run_start > 0)
1130 .then(|| text[..run_start].chars().last())
1131 .flatten();
1132
1133 let followed_by_ws = next_char.is_none_or(|c| c.is_whitespace());
1134 if followed_by_ws {
1135 return false;
1136 }
1137 let followed_by_punct = next_char.is_some_and(is_unicode_punct_or_symbol);
1138 if !followed_by_punct {
1139 return true;
1140 }
1141 prev_char.is_none_or(|c| c.is_whitespace() || is_unicode_punct_or_symbol(c))
1142}
1143
1144fn is_right_flanking(text: &str, run_start: usize, run_len: usize) -> bool {
1145 let after = run_start + run_len;
1146 let next_char = text.get(after..).and_then(|s| s.chars().next());
1147 let prev_char = (run_start > 0)
1148 .then(|| text[..run_start].chars().last())
1149 .flatten();
1150
1151 let preceded_by_ws = prev_char.is_none_or(|c| c.is_whitespace());
1152 if preceded_by_ws {
1153 return false;
1154 }
1155 let preceded_by_punct = prev_char.is_some_and(is_unicode_punct_or_symbol);
1156 if !preceded_by_punct {
1157 return true;
1158 }
1159 next_char.is_none_or(|c| c.is_whitespace() || is_unicode_punct_or_symbol(c))
1160}
1161
1162// ============================================================================
// Pass 3: Process emphasis (CommonMark §6.2)
1164// ============================================================================
1165
1166/// Run the CommonMark §6.3 `process_emphasis` algorithm over the IR's
1167/// delim runs. Mutates the IR in place: matched runs gain entries in their
1168/// `matches` vec, unmatched bytes stay implicit (the emission pass treats
1169/// any byte not covered by a match as literal text).
1170///
1171/// The algorithm tracks a per-bucket `openers_bottom` exclusive lower
1172/// bound to keep walk-back bounded; consume rules and the §6.2 mod-3
1173/// rejection match the reference implementation.
1174pub fn process_emphasis(events: &mut [IrEvent], dialect: crate::options::Dialect) {
1175 process_emphasis_in_range(events, 0, events.len(), dialect);
1176}
1177
1178/// Range-scoped variant of [`process_emphasis`].
1179///
1180/// Only delim runs whose IR event index lies in `[lo, hi)` are considered.
1181/// Used by [`build_full_plans`] to run emphasis pairing inside each
1182/// resolved bracket pair *before* the global top-level pass, so emphasis
1183/// can never form across a link's bracket boundary (CommonMark §6.3
1184/// requires bracket resolution to happen first when at a `]`, with
1185/// emphasis processed on the link's inner range).
1186///
1187/// The function additionally skips delim runs that already carry a
1188/// recorded match in their `matches` vec — this lets the second
1189/// (top-level) pass reuse the same algorithm without re-pairing bytes
1190/// already consumed by inner-range passes.
1191pub fn process_emphasis_in_range(
1192 events: &mut [IrEvent],
1193 lo: usize,
1194 hi: usize,
1195 dialect: crate::options::Dialect,
1196) {
1197 process_emphasis_in_range_filtered(events, lo, hi, None, dialect);
1198}
1199
1200/// Internal variant of [`process_emphasis_in_range`] with an optional
1201/// exclusion bitmap. Event indices for which `excluded[i] == true` are
1202/// treated as if their delim run were already fully consumed — used by
1203/// [`build_full_plans`] to keep the top-level emphasis pass from pairing
1204/// across a resolved bracket pair's boundary (the inner delim runs of
1205/// such a pair belong to the link's inner range and were already paired
1206/// by the scoped pass).
1207fn process_emphasis_in_range_filtered(
1208 events: &mut [IrEvent],
1209 lo: usize,
1210 hi: usize,
1211 excluded: Option<&[bool]>,
1212 dialect: crate::options::Dialect,
1213) {
1214 let is_commonmark = dialect == crate::options::Dialect::CommonMark;
1215 if is_commonmark {
1216 run_emphasis_pass(events, lo, hi, excluded, dialect, &[], false);
1217 return;
1218 }
1219 // Pandoc dialect: cascade-then-rerun. Run the standard pass, then
1220 // invalidate Emph/Strong pairs whose inner range contains an
1221 // unmatched same-char run with both can_open && can_close (Pandoc's
1222 // recursive descent would have failed those outer pairs because the
1223 // inner content has a stray, ambiguous delimiter the recursive
1224 // parser cannot pair). The invalidated pairs go into a "rejected
1225 // list" that the next iteration of the standard pass consults to
1226 // pick a different opener for the same closer (or reject the
1227 // closer altogether). Iterate to a fixed point.
1228 //
1229 // The rerun (iter 2+) runs in `strict` mode: a candidate pair is
1230 // rejected if its inner range contains an unmatched same-char run
1231 // with count > pair.count. This mirrors pandoc-markdown's
1232 // recursive-descent semantics where, e.g. inside a failed outer
1233 // `**...**` Strong, the inner `one c` parser's `option2`
1234 // (`string [c,c] >> two c mempty`) greedily consumes a stray `**`
1235 // and prevents subsequent `*` runs from pairing as Emph. Without
1236 // this gate, `**foo *bar** baz*` would produce Emph[bar** baz]
1237 // after the outer Strong invalidation, but pandoc treats it as
1238 // all-literal because the inner `**` blocks the Emph match.
1239 let mut rejected: Vec<(usize, usize)> = Vec::new();
1240 let max_iters = events.len().saturating_add(2);
1241 let mut iter = 0;
1242 loop {
1243 let strict = iter > 0;
1244 run_emphasis_pass(events, lo, hi, excluded, dialect, &rejected, strict);
1245 let invalidations = pandoc_cascade_invalidate(events, excluded);
1246 if invalidations.is_empty() {
1247 break;
1248 }
1249 rejected.extend(invalidations);
1250 iter += 1;
1251 if iter >= max_iters {
1252 break;
1253 }
1254 }
1255 // Recovery for `***A **B** C***` patterns: synthesise the inner
1256 // Strong match the standard delim-stack algorithm can't reach.
1257 pandoc_inner_strong_recovery(events);
1258}
1259
1260/// One pass of the CommonMark §6.2 emphasis pairing algorithm over the
1261/// IR's [`DelimRun`](IrEvent::DelimRun) events in `[lo, hi)`. Pandoc
1262/// dialect gates apply when `dialect == Dialect::Pandoc`. The
1263/// `rejected_pairs` list (Pandoc only) excludes specific
1264/// (opener_event_idx, closer_event_idx) pairs from matching — used by
1265/// the cascade-then-rerun loop to prevent invalidated pairs from
1266/// re-forming on the next iteration.
1267fn run_emphasis_pass(
1268 events: &mut [IrEvent],
1269 lo: usize,
1270 hi: usize,
1271 excluded: Option<&[bool]>,
1272 dialect: crate::options::Dialect,
1273 rejected_pairs: &[(usize, usize)],
1274 strict_pandoc: bool,
1275) {
1276 let is_commonmark = dialect == crate::options::Dialect::CommonMark;
1277 let hi = hi.min(events.len());
1278 if lo >= hi {
1279 return;
1280 }
1281 // Indices of DelimRun events within [lo, hi), in order, that have
1282 // not already been fully consumed by an earlier scoped pass and that
1283 // are not in the optional exclusion bitmap.
1284 let mut delim_idxs: Vec<usize> = events[lo..hi]
1285 .iter()
1286 .enumerate()
1287 .filter_map(|(i, e)| {
1288 let abs = lo + i;
1289 match e {
1290 IrEvent::DelimRun { matches, .. }
1291 if matches.is_empty()
1292 && excluded.is_none_or(|ex| ex.get(abs).copied() != Some(true)) =>
1293 {
1294 Some(abs)
1295 }
1296 _ => None,
1297 }
1298 })
1299 .collect();
1300 if delim_idxs.is_empty() {
1301 return;
1302 }
1303
1304 // Working state: count (remaining unmatched chars) and source_start
1305 // (first remaining char) per delim run. Indexed by position in
1306 // `delim_idxs`.
1307 let mut count: Vec<usize> = Vec::with_capacity(delim_idxs.len());
1308 let mut source_start: Vec<usize> = Vec::with_capacity(delim_idxs.len());
1309 let mut removed: Vec<bool> = vec![false; delim_idxs.len()];
1310
1311 for &ev_idx in &delim_idxs {
1312 if let IrEvent::DelimRun { start, end, .. } = &events[ev_idx] {
1313 count.push(end - start);
1314 source_start.push(*start);
1315 }
1316 }
1317
1318 // openers_bottom[ch_idx][len%3][can_open] → exclusive lower bound
1319 // (an index into `delim_idxs`, or None meaning "no bottom yet").
1320 let mut openers_bottom: [[[Option<usize>; 2]; 3]; 2] = [[[None; 2]; 3]; 2];
1321
1322 // First active index, scanning forward.
1323 let first_active =
1324 |removed: &[bool]| -> Option<usize> { (0..removed.len()).find(|&i| !removed[i]) };
1325 let next_active = |removed: &[bool], from: usize| -> Option<usize> {
1326 (from + 1..removed.len()).find(|&i| !removed[i])
1327 };
1328 let prev_active =
1329 |removed: &[bool], from: usize| -> Option<usize> { (0..from).rev().find(|&i| !removed[i]) };
1330
1331 let min_closer_count = 1usize;
1332 let mut closer_local = first_active(&removed);
1333 while let Some(c) = closer_local {
1334 let ev_c_idx = delim_idxs[c];
1335 let (ch_c, can_open_c, can_close_c) = match &events[ev_c_idx] {
1336 IrEvent::DelimRun {
1337 ch,
1338 can_open,
1339 can_close,
1340 ..
1341 } => (*ch, *can_open, *can_close),
1342 _ => unreachable!(),
1343 };
1344 if !can_close_c || removed[c] || count[c] < min_closer_count {
1345 closer_local = next_active(&removed, c);
1346 continue;
1347 }
1348
1349 let ch_idx = if ch_c == b'*' { 0 } else { 1 };
1350 let closer_mod = count[c] % 3;
1351 let closer_open_bucket = can_open_c as usize;
1352 let bottom = openers_bottom[ch_idx][closer_mod][closer_open_bucket];
1353
1354 // Walk back to find a compatible opener.
1355 let mut found_opener: Option<usize> = None;
1356 let mut walk = prev_active(&removed, c);
1357 while let Some(o) = walk {
1358 if Some(o) == bottom {
1359 break;
1360 }
1361 let ev_o_idx = delim_idxs[o];
1362 let (ch_o, can_open_o, can_close_o) = match &events[ev_o_idx] {
1363 IrEvent::DelimRun {
1364 ch,
1365 can_open,
1366 can_close,
1367 ..
1368 } => (*ch, *can_open, *can_close),
1369 _ => unreachable!(),
1370 };
1371 if !removed[o] && ch_o == ch_c && can_open_o {
1372 let oc_sum = count[o] + count[c];
1373 let opener_both = can_open_o && can_close_o;
1374 let closer_both = can_open_c && can_close_c;
1375 let mod3_reject = is_commonmark
1376 && (opener_both || closer_both)
1377 && oc_sum.is_multiple_of(3)
1378 && !(count[o].is_multiple_of(3) && count[c].is_multiple_of(3));
1379 // Pandoc-markdown rejects emph/strong pairs whose counts
1380 // disagree in the exactly-(1,2) / (2,1) shape:
1381 // - `**foo*` (2,1): `try_parse_two` looks only for a
1382 // `**` closer; the lone `*` doesn't satisfy that.
1383 // - `*foo**` (1,2): `try_parse_one` encountering `**`
1384 // tries `try_parse_two`; absence of an inner `**`
1385 // closer cascades the outer parse to fail.
1386 // Other count combinations DO match (verified against
1387 // `pandoc -f markdown`):
1388 // - (1,3) / (3,1) → emph match, opposite-side
1389 // leftover `**` literal.
1390 // - (2,3) / (3,2) → strong match, single `*` literal.
1391 // - (3,3) → STRONG(EM(...)) nested.
1392 // - (1..3, 4+) → match (Pandoc's ender walks the
1393 // closer run for a valid position; algorithm
1394 // consumes leftmost via leftover-as-literal).
1395 // Opener count >= 4 is rejected (Pandoc's
1396 // `try_parse_emphasis` has no count-4+ dispatch).
1397 let pandoc_reject = !is_commonmark
1398 && ((count[o] == 1 && count[c] == 2)
1399 || (count[o] == 2 && count[c] == 1)
1400 || count[o] >= 4);
1401 let pair_rejected = !is_commonmark && {
1402 let oe = delim_idxs[o];
1403 let ce = delim_idxs[c];
1404 rejected_pairs.iter().any(|&(ro, rc)| ro == oe && rc == ce)
1405 };
1406 // Pandoc strict-rerun gate (iter 2+ only): block a
1407 // candidate pair if any unmatched same-char run between
1408 // its opener and closer has remaining count strictly
1409 // greater than the consume rule for this pair.
1410 // Mirrors pandoc-markdown's recursive descent where
1411 // `one c`'s `option2` (`string [c,c] >> two c`) would
1412 // greedily consume a stray higher-count run, blocking
1413 // the outer `one c` from finding its `ender c 1` —
1414 // e.g. `**foo *bar** baz*` after the outer Strong
1415 // invalidates: a naïve rerun pairs ev1 (`*`) ↔ ev3
1416 // (`*`) as Emph (consume=1), but pandoc treats the
1417 // `**` between as having "consumed" any further
1418 // matching, leaving everything literal.
1419 let strict_block = strict_pandoc && {
1420 let tentative_consume = if !is_commonmark && count[o] >= 3 && count[c] >= 3 {
1421 1
1422 } else if count[o] >= 2 && count[c] >= 2 {
1423 2
1424 } else {
1425 1
1426 };
1427 let lo_evt = delim_idxs[o] + 1;
1428 let hi_evt = delim_idxs[c];
1429 (lo_evt..hi_evt).any(|k| match &events[k] {
1430 IrEvent::DelimRun {
1431 ch: ch_k,
1432 start,
1433 end,
1434 matches,
1435 ..
1436 } => {
1437 *ch_k == ch_c && {
1438 let total = end - start;
1439 let consumed: usize = matches.iter().map(|m| m.len as usize).sum();
1440 total.saturating_sub(consumed) > tentative_consume
1441 }
1442 }
1443 _ => false,
1444 })
1445 };
1446 if !mod3_reject && !pandoc_reject && !pair_rejected && !strict_block {
1447 found_opener = Some(o);
1448 break;
1449 }
1450 }
1451 if o == 0 {
1452 break;
1453 }
1454 walk = prev_active(&removed, o);
1455 }
1456
1457 if let Some(o) = found_opener {
1458 // Consume rule:
1459 // CommonMark — consume 2 (Strong) when both sides have
1460 // >= 2 chars, else 1 (Emph). For `***x***` (3,3) this
1461 // produces EM(STRONG(...)) because the first match
1462 // consumes 2 from each side (Strong outermost).
1463 // Pandoc — when both sides have >= 3, consume 1 first
1464 // (Emph innermost) leaving 2 + 2 to pair as Strong on
1465 // the second pass. This produces STRONG(EM(...)) for
1466 // `***x***`, matching Pandoc-markdown's recursive
1467 // `try_parse_three` algorithm.
1468 let consume = if !is_commonmark && count[o] >= 3 && count[c] >= 3 {
1469 1
1470 } else if count[o] >= 2 && count[c] >= 2 {
1471 2
1472 } else {
1473 1
1474 };
1475 let kind = if consume == 2 {
1476 EmphasisKind::Strong
1477 } else {
1478 EmphasisKind::Emph
1479 };
1480
1481 // Opener consumes inner-edge (rightmost) chars.
1482 let opener_match_offset =
1483 source_start[o] + count[o] - consume - source_start_event(&events[delim_idxs[o]]);
1484 // Closer consumes inner-edge (leftmost) chars.
1485 let closer_match_offset = source_start[c] - source_start_event(&events[delim_idxs[c]]);
1486
1487 // Record match on opener.
1488 if let IrEvent::DelimRun { matches, .. } = &mut events[delim_idxs[o]] {
1489 matches.push(DelimMatch {
1490 offset_in_run: opener_match_offset as u8,
1491 len: consume as u8,
1492 is_opener: true,
1493 partner_event: delim_idxs[c] as u32,
1494 partner_offset: closer_match_offset as u8,
1495 kind,
1496 });
1497 }
1498 // Record match on closer.
1499 if let IrEvent::DelimRun { matches, .. } = &mut events[delim_idxs[c]] {
1500 matches.push(DelimMatch {
1501 offset_in_run: closer_match_offset as u8,
1502 len: consume as u8,
1503 is_opener: false,
1504 partner_event: delim_idxs[o] as u32,
1505 partner_offset: opener_match_offset as u8,
1506 kind,
1507 });
1508 }
1509
1510 count[o] -= consume;
1511 source_start[c] += consume;
1512 count[c] -= consume;
1513
1514 // Remove all openers strictly between o and c.
1515 let mut between = next_active(&removed, o);
1516 while let Some(idx) = between {
1517 if idx == c {
1518 break;
1519 }
1520 removed[idx] = true;
1521 between = next_active(&removed, idx);
1522 }
1523
1524 if count[o] == 0 {
1525 removed[o] = true;
1526 }
1527 if count[c] == 0 {
1528 removed[c] = true;
1529 closer_local = next_active(&removed, c);
1530 }
1531 // Else re-process the same closer with reduced count.
1532 } else {
1533 openers_bottom[ch_idx][closer_mod][closer_open_bucket] = prev_active(&removed, c);
1534 if !can_open_c {
1535 removed[c] = true;
1536 }
1537 closer_local = next_active(&removed, c);
1538 }
1539 }
1540
1541 // No further mutation needed: matches are recorded; remaining bytes
1542 // stay implicit literal. Pandoc cascade is invoked by the caller
1543 // (`process_emphasis_in_range_filtered`) once per pass so it can
1544 // accumulate invalidations into a rejected-pairs list and re-run.
1545 let _ = (&mut delim_idxs, &mut openers_bottom, min_closer_count);
1546}
1547
1548/// Pandoc-only post-processing pass over [`process_emphasis_in_range_filtered`]
1549/// matches: invalidate any matched delim pair that contains an unmatched
1550/// same-character run between its opener and closer. Returns the list
1551/// of (opener_event_idx, closer_event_idx) pairs that were invalidated
1552/// in this call, so the caller can seed a rejected-pairs list and
1553/// re-run the standard pass — this lets Pandoc re-pair the inner runs
1554/// that the invalidated outer match would have stolen via
1555/// between-removal (e.g. `*foo **bar* baz**` → after the outer
1556/// `ev0..ev2` Emph is invalidated, `ev1..ev3` matches as Strong on the
1557/// next iteration).
1558fn pandoc_cascade_invalidate(
1559 events: &mut [IrEvent],
1560 excluded: Option<&[bool]>,
1561) -> Vec<(usize, usize)> {
1562 let mut invalidated_pairs: Vec<(usize, usize)> = Vec::new();
1563 // Early-exit: if there are no `DelimRun` events at all, the cascade
1564 // pass is a no-op. Avoids allocating the two scratch vecs below for
1565 // every range with no `*`/`_` runs (which is the common case for
1566 // ranges that contain only standalone constructs / brackets).
1567 if !events.iter().any(|e| matches!(e, IrEvent::DelimRun { .. })) {
1568 return invalidated_pairs;
1569 }
1570 let is_excluded = |k: usize| excluded.is_some_and(|ex| ex.get(k).copied() == Some(true));
1571 // Reuse two scratch vecs across the inner loop iterations instead
1572 // of `.collect()` each time. These are tiny per-paragraph
1573 // allocations but the function is called for every Pandoc inline
1574 // emphasis pass and shows up in malloc traffic.
1575 let mut total: Vec<usize> = Vec::with_capacity(events.len());
1576 let mut consumed: Vec<usize> = Vec::with_capacity(events.len());
1577 loop {
1578 total.clear();
1579 consumed.clear();
1580 // Compute total bytes (run length) and consumed bytes (sum of
1581 // match lens) per DelimRun event index.
1582 total.extend(events.iter().map(|e| match e {
1583 IrEvent::DelimRun { start, end, .. } => end - start,
1584 _ => 0,
1585 }));
1586 consumed.extend(events.iter().map(|e| match e {
1587 IrEvent::DelimRun { matches, .. } => matches.iter().map(|m| m.len as usize).sum(),
1588 _ => 0,
1589 }));
1590
1591 // Find a pair to invalidate. We invalidate one and restart so
1592 // the cascade can re-evaluate dependent pairs.
1593 let mut to_invalidate: Option<(usize, u8)> = None;
1594 'outer: for opener_idx in 0..events.len() {
1595 let IrEvent::DelimRun {
1596 ch: ch_o, matches, ..
1597 } = &events[opener_idx]
1598 else {
1599 continue;
1600 };
1601 for (mi, m) in matches.iter().enumerate() {
1602 if !m.is_opener {
1603 continue;
1604 }
1605 let closer_idx = m.partner_event as usize;
1606 if closer_idx <= opener_idx || closer_idx >= events.len() {
1607 continue;
1608 }
1609 // Scan events strictly between opener and closer for any
1610 // DelimRun with the same `ch`, unmatched bytes, AND
1611 // both `can_open` and `can_close` (i.e., the run could
1612 // have participated in pairing on both sides). A
1613 // can_open-only or can_close-only run is a one-sided
1614 // fragment (e.g. an isolated `*` after a backslash
1615 // escape) that the Pandoc recursive-descent path would
1616 // never have tried as a nested-strong opener — those
1617 // shouldn't cascade-invalidate the surrounding pair.
1618 for k in (opener_idx + 1)..closer_idx {
1619 if is_excluded(k) {
1620 continue;
1621 }
1622 if let IrEvent::DelimRun {
1623 ch: ch_k,
1624 can_open: co_k,
1625 can_close: cc_k,
1626 ..
1627 } = &events[k]
1628 && *ch_k == *ch_o
1629 && consumed[k] < total[k]
1630 && *co_k
1631 && *cc_k
1632 {
1633 to_invalidate = Some((opener_idx, mi as u8));
1634 break 'outer;
1635 }
1636 }
1637 }
1638 }
1639
1640 let Some((opener_idx, mi)) = to_invalidate else {
1641 break;
1642 };
1643
1644 // Look up the partner event/offset before mutating.
1645 let (closer_idx, opener_offset) = match &events[opener_idx] {
1646 IrEvent::DelimRun { matches, .. } => {
1647 let m = matches[mi as usize];
1648 (m.partner_event as usize, m.offset_in_run)
1649 }
1650 _ => break,
1651 };
1652
1653 // Remove the opener match.
1654 if let IrEvent::DelimRun { matches, .. } = &mut events[opener_idx] {
1655 matches.remove(mi as usize);
1656 }
1657 // Remove the corresponding closer match (closer's match has
1658 // is_opener=false and partner_offset == opener's offset_in_run).
1659 if let IrEvent::DelimRun { matches, .. } = &mut events[closer_idx] {
1660 matches.retain(|m| m.is_opener || m.partner_offset != opener_offset);
1661 }
1662 invalidated_pairs.push((opener_idx, closer_idx));
1663 }
1664 invalidated_pairs
1665}
1666
1667/// Pandoc-only post-pass: recover the inner Strong match in
1668/// `***A **B** C***` patterns where the IR's standard pass produced
1669/// `Emph[Strong[A], "B**...** C"]` (matching the outer triple as
1670/// Strong+Emph but losing the inner `**...**`-as-Strong-of-`C` pair).
1671///
1672/// Pandoc's recursive descent here goes
1673/// `three c → ender c 2 → one c → option2 → two c`, producing
1674/// `Emph[Strong[A], "B", Strong[C]]` — two Strong nodes inside an outer
1675/// Emph. The standard delim-stack algorithm can't reach this pairing
1676/// because between-removal during the outer Emph match removes the
1677/// inner closer-side `**` (e.g. `bar**`) from the candidate pool.
1678///
1679/// This recovery scans Emph matches whose opener and closer originally
1680/// had count >= 3, and whose closer has unmatched bytes >= 2 after the
1681/// standard pass; for each, we look for an unmatched same-char
1682/// between-run with count >= 2 and `can_close = true` (the would-be
1683/// inner-Strong opener) and synthesise a Strong match that consumes
1684/// the leftmost 2 bytes of the closer (where the existing Emph match
1685/// shifts to the rightmost 1 byte). The byte-position rewrite lets
1686/// the CST emission produce well-nested `Emph[..., Strong[...]]` —
1687/// outer Emph close at the rightmost outer-triple byte, inner Strong
1688/// close at the leftmost two.
1689fn pandoc_inner_strong_recovery(events: &mut [IrEvent]) {
1690 let n = events.len();
1691 // (between_idx, opener_idx, closer_idx, len)
1692 let mut to_apply: Vec<(usize, usize, usize, u8)> = Vec::new();
1693
1694 for opener_idx in 0..n {
1695 let (open_total, open_matches_clone, ch_o) = match &events[opener_idx] {
1696 IrEvent::DelimRun {
1697 start,
1698 end,
1699 matches,
1700 ch,
1701 ..
1702 } => (*end - *start, matches.clone(), *ch),
1703 _ => continue,
1704 };
1705 if open_total < 3 {
1706 continue;
1707 }
1708
1709 for m in open_matches_clone.iter() {
1710 if !m.is_opener || m.kind != EmphasisKind::Emph {
1711 continue;
1712 }
1713 let closer_idx = m.partner_event as usize;
1714 if closer_idx <= opener_idx || closer_idx >= n {
1715 continue;
1716 }
1717
1718 let (close_total, close_consumed) = match &events[closer_idx] {
1719 IrEvent::DelimRun {
1720 start,
1721 end,
1722 matches,
1723 ..
1724 } => {
1725 let total = end - start;
1726 let consumed: usize = matches.iter().map(|m| m.len as usize).sum();
1727 (total, consumed)
1728 }
1729 _ => continue,
1730 };
1731 if close_total < 3 {
1732 continue;
1733 }
1734 let leftover = close_total.saturating_sub(close_consumed);
1735 if leftover < 2 {
1736 continue;
1737 }
1738
1739 // Walk backward from closer-1 looking for the rightmost
1740 // unmatched same-char run with count >= 2 and
1741 // can_close=true.
1742 for k in ((opener_idx + 1)..closer_idx).rev() {
1743 if let IrEvent::DelimRun {
1744 ch,
1745 start,
1746 end,
1747 matches,
1748 can_close,
1749 ..
1750 } = &events[k]
1751 {
1752 if *ch != ch_o || !*can_close {
1753 continue;
1754 }
1755 let total = end - start;
1756 let consumed: usize = matches.iter().map(|m| m.len as usize).sum();
1757 let remaining = total.saturating_sub(consumed);
1758 if remaining < 2 {
1759 continue;
1760 }
1761 to_apply.push((k, opener_idx, closer_idx, 2));
1762 break;
1763 }
1764 }
1765 }
1766 }
1767
1768 for (between_idx, opener_idx, closer_idx, len) in to_apply {
1769 // Find the existing Emph match on the closer side.
1770 let (closer_emph_match_idx, closer_emph_offset) = {
1771 let mut found: Option<(usize, u8)> = None;
1772 if let IrEvent::DelimRun { matches, .. } = &events[closer_idx] {
1773 for (mi, m) in matches.iter().enumerate() {
1774 if !m.is_opener
1775 && m.partner_event as usize == opener_idx
1776 && m.kind == EmphasisKind::Emph
1777 {
1778 found = Some((mi, m.offset_in_run));
1779 break;
1780 }
1781 }
1782 }
1783 match found {
1784 Some(x) => x,
1785 None => continue,
1786 }
1787 };
1788
1789 // Find the corresponding Emph match on the opener side.
1790 let opener_emph_match_idx = {
1791 let mut found: Option<usize> = None;
1792 if let IrEvent::DelimRun { matches, .. } = &events[opener_idx] {
1793 for (mi, m) in matches.iter().enumerate() {
1794 if m.is_opener
1795 && m.partner_event as usize == closer_idx
1796 && m.kind == EmphasisKind::Emph
1797 {
1798 found = Some(mi);
1799 break;
1800 }
1801 }
1802 }
1803 match found {
1804 Some(x) => x,
1805 None => continue,
1806 }
1807 };
1808
1809 // Shift the Emph closer's offset to the right of the new
1810 // Strong closer's bytes (Strong takes leftmost `len` bytes,
1811 // Emph takes the next byte).
1812 let new_closer_emph_offset = closer_emph_offset + len;
1813
1814 // Update closer's Emph offset_in_run.
1815 if let IrEvent::DelimRun { matches, .. } = &mut events[closer_idx] {
1816 matches[closer_emph_match_idx].offset_in_run = new_closer_emph_offset;
1817 }
1818 // Update opener's Emph partner_offset to point at the shifted
1819 // Emph closer position.
1820 if let IrEvent::DelimRun { matches, .. } = &mut events[opener_idx] {
1821 matches[opener_emph_match_idx].partner_offset = new_closer_emph_offset;
1822 }
1823
1824 // Add Strong opener match on the between-run.
1825 if let IrEvent::DelimRun { matches, .. } = &mut events[between_idx] {
1826 matches.push(DelimMatch {
1827 offset_in_run: 0,
1828 len,
1829 is_opener: true,
1830 partner_event: closer_idx as u32,
1831 partner_offset: closer_emph_offset,
1832 kind: EmphasisKind::Strong,
1833 });
1834 }
1835 // Add Strong closer match on the closer (at the original
1836 // pre-shift Emph-closer position; the bytes that were the
1837 // single Emph closer now become the leftmost 2 bytes of the
1838 // Strong closer).
1839 if let IrEvent::DelimRun { matches, .. } = &mut events[closer_idx] {
1840 matches.push(DelimMatch {
1841 offset_in_run: closer_emph_offset,
1842 len,
1843 is_opener: false,
1844 partner_event: between_idx as u32,
1845 partner_offset: 0,
1846 kind: EmphasisKind::Strong,
1847 });
1848 }
1849 }
1850}
1851
1852fn source_start_event(event: &IrEvent) -> usize {
1853 match event {
1854 IrEvent::DelimRun { start, .. } => *start,
1855 _ => unreachable!("source_start_event called on non-DelimRun"),
1856 }
1857}
1858
1859// ============================================================================
1860// Pass 3: Process brackets (CommonMark §6.3)
1861// ============================================================================
1862
1863/// Resolve `[`/`![`/`]` markers into link/image nodes per CommonMark §6.3
1864/// (with Pandoc-aware variations under `Dialect::Pandoc`).
1865///
1866/// Walks the IR forward looking for `]` markers. For each one, finds the
1867/// nearest active matching `[`/`` or `[text](dest "title")`.
1871/// 2. Full reference: `[text][label]`, where `label` is in `refdefs`.
1872/// 3. Collapsed reference: `[text][]`, where `text` (normalised) is in
1873/// `refdefs`.
1874/// 4. Shortcut reference: `[text]` not followed by `(` or `[`, where
1875/// `text` (normalised) is in `refdefs`.
1876///
1877/// On a match, the opener gets a `BracketResolution` and the closer is
1878/// flagged `matched`. Under `Dialect::CommonMark`, all earlier active link
1879/// openers are deactivated to implement the §6.3 "links may not contain
1880/// other links" rule (image brackets do not deactivate earlier link
1881/// openers — only links do). Under `Dialect::Pandoc`, the deactivate-pass
1882/// is skipped: pandoc-native is outer-wins for nested links (the inner
1883/// `[inner](u2)` of `[link [inner](u2)](u1)` is literal text inside the
1884/// outer link), and the dispatcher enforces this via a `suppress_inner_links`
1885/// flag during LINK-text recursion. So under Pandoc the IR can leave both
1886/// outer and inner resolved and trust the dispatcher to suppress inner
1887/// LINK emission.
1888///
1889/// On a miss the bracket pair stays opaque-as-literal and the closer is
1890/// dropped from the bracket stack so the next `]` can re-pair.
1891///
1892/// Reference-form resolution consults the refdef map under both
1893/// dialects (CommonMark §6.3 and Pandoc-markdown agree on the
1894/// document-scoped lookup rule). Under Pandoc, when a bracket-shape
1895/// pattern (`[text][label]`, `[text][]`, `[text]`) doesn't resolve to
1896/// a refdef, the opener is tagged with `unresolved_ref = Some(...)`
1897/// and the closer's `matched` is set to `true` so that
1898/// [`build_bracket_plan`] emits a [`BracketDispo::UnresolvedReference`]
1899/// keyed at the opener. Emission then wraps `[start, end)` in an
1900/// `UNRESOLVED_REFERENCE` node — distinct from `LINK` — so downstream
1901/// tools (linter, LSP) can attach behavior to the bracket-shape
1902/// pattern without the parser having to lie about resolution.
1903///
1904/// Under CommonMark, no `unresolved_ref` is recorded; the
1905/// no-resolution fall-through behaves as today (opener deactivated,
1906/// brackets emit as literal text).
1907pub fn process_brackets(
1908 events: &mut [IrEvent],
1909 text: &str,
1910 refdefs: Option<&RefdefMap>,
1911 dialect: crate::options::Dialect,
1912 allow_spaced: bool,
1913) {
1914 let empty: HashSet<String> = HashSet::new();
1915 let labels: &HashSet<String> = match refdefs {
1916 Some(map) => map.as_ref(),
1917 None => &empty,
1918 };
1919 let is_commonmark = dialect == crate::options::Dialect::CommonMark;
1920 // Refdef-aware label resolution under both dialects.
1921 let label_resolves =
1922 |key_norm: &str| -> bool { !key_norm.is_empty() && labels.contains(key_norm) };
1923
1924 // Walk forward through events, treating it as a linear scan for `]`.
1925 let mut i = 0;
1926 while i < events.len() {
1927 let close_pos = match &events[i] {
1928 IrEvent::CloseBracket { pos, .. } => *pos,
1929 _ => {
1930 i += 1;
1931 continue;
1932 }
1933 };
1934
1935 // Find the nearest active OpenBracket before `i`.
1936 let mut o = match find_active_opener(events, i) {
1937 Some(o) => o,
1938 None => {
1939 i += 1;
1940 continue;
1941 }
1942 };
1943
1944 let (open_end, is_image) = match &events[o] {
1945 IrEvent::OpenBracket { end, is_image, .. } => (*end, *is_image),
1946 _ => unreachable!(),
1947 };
1948 let text_start = open_end;
1949 let text_end = close_pos;
1950 let after_close = close_pos + 1;
1951
1952 // 1. Inline link / image.
1953 if let Some((suffix_end, dest, title)) = try_inline_suffix(text, after_close) {
1954 // §6.3 link-in-link rule (CommonMark): if this is a *link*
1955 // (not an image), and any earlier active link opener exists,
1956 // deactivate them. We also deactivate openers strictly before
1957 // `o` here because matching means the inner link wins; the
1958 // spec applies this *after* matching. Pandoc skips this —
1959 // outer-wins is enforced by the dispatcher's
1960 // `suppress_inner_links` flag during LINK-text recursion.
1961 if !is_image && is_commonmark {
1962 deactivate_earlier_link_openers(events, o);
1963 }
1964 commit_resolution(
1965 events,
1966 o,
1967 i,
1968 text_start,
1969 text_end,
1970 after_close,
1971 suffix_end,
1972 LinkKind::Inline { dest, title },
1973 );
1974 // Remove the opener from the bracket stack: it has been
1975 // matched (active=false will fall out automatically since
1976 // resolution is Some).
1977 mark_opener_resolved(events, o);
1978 i += 1;
1979 continue;
1980 }
1981
1982 // 2. Full reference link: `[text][label]`.
1983 let full_ref_suffix = try_full_reference_suffix(text, after_close, allow_spaced);
1984 if let Some((suffix_end, label_raw)) = &full_ref_suffix {
1985 let label_norm = normalize_label(label_raw);
1986 if label_resolves(&label_norm) {
1987 if !is_image && is_commonmark {
1988 deactivate_earlier_link_openers(events, o);
1989 }
1990 commit_resolution(
1991 events,
1992 o,
1993 i,
1994 text_start,
1995 text_end,
1996 after_close,
1997 *suffix_end,
1998 LinkKind::FullReference {
1999 label: label_raw.clone(),
2000 },
2001 );
2002 mark_opener_resolved(events, o);
2003 i += 1;
2004 continue;
2005 }
2006 // Bracketed but unresolved label: §6.3 says we still treat
2007 // `[text][label]` as not-a-link, but the brackets get
2008 // consumed as literal text AND the shortcut form is
2009 // suppressed (since the `]` is followed by a link label).
2010 }
2011
2012 // 3. Collapsed `[]`.
2013 let link_text = &text[text_start..text_end];
2014 let link_text_norm = normalize_label(link_text);
2015 let (is_collapsed, collapsed_suffix_end) =
2016 collapsed_marker_span(text, after_close, allow_spaced)
2017 .map_or((false, after_close + 2), |end| (true, end));
2018
2019 if is_collapsed && label_resolves(&link_text_norm) {
2020 if !is_image && is_commonmark {
2021 deactivate_earlier_link_openers(events, o);
2022 }
2023 commit_resolution(
2024 events,
2025 o,
2026 i,
2027 text_start,
2028 text_end,
2029 after_close,
2030 collapsed_suffix_end,
2031 LinkKind::CollapsedReference,
2032 );
2033 mark_opener_resolved(events, o);
2034 i += 1;
2035 continue;
2036 }
2037 // `[text][]` with text not in refdefs — falls through to
2038 // literal text; shortcut is suppressed (followed by `[]`).
2039
2040 // 4. Shortcut form: `[text]` not followed by `[]` or `[label]`.
2041 // Per CommonMark §6.3: "A shortcut reference link consists of a
2042 // link label that matches a link reference definition elsewhere
2043 // in the document and is not followed by [] or a link label."
2044 // The full-ref / collapsed shape attempts above suppress the
2045 // shortcut even when their labels don't resolve — the bracket
2046 // bytes still get consumed as literal text.
2047 let shortcut_suppressed = full_ref_suffix.is_some() || is_collapsed;
2048 if !shortcut_suppressed && label_resolves(&link_text_norm) {
2049 if !is_image && is_commonmark {
2050 deactivate_earlier_link_openers(events, o);
2051 }
2052 commit_resolution(
2053 events,
2054 o,
2055 i,
2056 text_start,
2057 text_end,
2058 after_close,
2059 after_close,
2060 LinkKind::ShortcutReference,
2061 );
2062 mark_opener_resolved(events, o);
2063 i += 1;
2064 continue;
2065 }
2066
2067 // No resolution. Under Pandoc, the bracket pair is still a
2068 // recognisable reference shape (full / collapsed / shortcut) —
2069 // tag the opener with `unresolved_ref` so emission wraps it
2070 // in an `UNRESOLVED_REFERENCE` node, and mark the closer
2071 // matched so it doesn't fall through to a literal `]` token.
2072 // Under CommonMark, behavior unchanged: deactivate the opener,
2073 // brackets emit as literal text.
2074 //
2075 // Empty-component shapes (`[]`, `[][]`) aren't reference
2076 // patterns even in spirit — pandoc-native treats them as
2077 // literal text — so skip wrapping.
2078 let unresolved_shape = if !is_commonmark {
2079 let (end, has_substantive_label) =
2080 if let Some((suffix_end, label_raw)) = &full_ref_suffix {
2081 (*suffix_end, !normalize_label(label_raw).is_empty())
2082 } else if is_collapsed {
2083 (collapsed_suffix_end, !link_text_norm.is_empty())
2084 } else {
2085 (after_close, !link_text_norm.is_empty())
2086 };
2087 if has_substantive_label {
2088 Some(UnresolvedRefShape {
2089 close_event: i as u32,
2090 text_end,
2091 end,
2092 })
2093 } else {
2094 None
2095 }
2096 } else {
2097 None
2098 };
2099 if let IrEvent::OpenBracket {
2100 active,
2101 unresolved_ref,
2102 ..
2103 } = &mut events[o]
2104 {
2105 *active = false;
2106 *unresolved_ref = unresolved_shape;
2107 }
2108 if unresolved_shape.is_some()
2109 && let IrEvent::CloseBracket { matched, .. } = &mut events[i]
2110 {
2111 *matched = true;
2112 }
2113 let _ = &mut o;
2114 i += 1;
2115 }
2116}
2117
2118fn find_active_opener(events: &[IrEvent], close_idx: usize) -> Option<usize> {
2119 (0..close_idx).rev().find(|&i| {
2120 matches!(
2121 &events[i],
2122 IrEvent::OpenBracket {
2123 active: true,
2124 resolution: None,
2125 ..
2126 }
2127 )
2128 })
2129}
2130
2131fn deactivate_earlier_link_openers(events: &mut [IrEvent], open_idx: usize) {
2132 for ev in &mut events[..open_idx] {
2133 if let IrEvent::OpenBracket {
2134 is_image: false,
2135 active,
2136 resolution: None,
2137 ..
2138 } = ev
2139 {
2140 *active = false;
2141 }
2142 }
2143}
2144
2145fn mark_opener_resolved(events: &mut [IrEvent], open_idx: usize) {
2146 if let IrEvent::OpenBracket { active, .. } = &mut events[open_idx] {
2147 *active = false;
2148 }
2149}
2150
2151#[allow(clippy::too_many_arguments)]
2152fn commit_resolution(
2153 events: &mut [IrEvent],
2154 open_idx: usize,
2155 close_idx: usize,
2156 text_start: usize,
2157 text_end: usize,
2158 suffix_start: usize,
2159 suffix_end: usize,
2160 kind: LinkKind,
2161) {
2162 if let IrEvent::OpenBracket { resolution, .. } = &mut events[open_idx] {
2163 *resolution = Some(BracketResolution {
2164 close_event: close_idx as u32,
2165 text_start,
2166 text_end,
2167 suffix_start,
2168 suffix_end,
2169 kind,
2170 });
2171 }
2172 if let IrEvent::CloseBracket { matched, .. } = &mut events[close_idx] {
2173 *matched = true;
2174 }
2175}
2176
2177/// Try to parse `(dest)` or `(dest "title")` inline link suffix starting
2178/// at `text[pos]`. Returns `(end_pos_exclusive, dest, title)`.
2179fn try_inline_suffix(text: &str, pos: usize) -> Option<(usize, String, Option<String>)> {
2180 let bytes = text.as_bytes();
2181 if pos >= bytes.len() || bytes[pos] != b'(' {
2182 return None;
2183 }
2184 let mut p = pos + 1;
2185 // Skip leading whitespace.
2186 while p < bytes.len() && matches!(bytes[p], b' ' | b'\t' | b'\n') {
2187 p += 1;
2188 }
2189 // Empty `()` — link with empty destination.
2190 if p < bytes.len() && bytes[p] == b')' {
2191 return Some((p + 1, String::new(), None));
2192 }
2193
2194 // Parse destination.
2195 let (dest, dest_end) = parse_link_destination(text, p)?;
2196 p = dest_end;
2197
2198 // Skip whitespace.
2199 while p < bytes.len() && matches!(bytes[p], b' ' | b'\t' | b'\n') {
2200 p += 1;
2201 }
2202
2203 // Optional title.
2204 let mut title = None;
2205 if p < bytes.len() && matches!(bytes[p], b'"' | b'\'' | b'(') {
2206 let (t, t_end) = parse_link_title(text, p)?;
2207 title = Some(t);
2208 p = t_end;
2209 while p < bytes.len() && matches!(bytes[p], b' ' | b'\t' | b'\n') {
2210 p += 1;
2211 }
2212 }
2213
2214 if p >= bytes.len() || bytes[p] != b')' {
2215 return None;
2216 }
2217 Some((p + 1, dest, title))
2218}
2219
/// Parse a link destination starting at byte `start` of `text`.
///
/// Two forms are accepted:
///
/// * `<...>` — ends at the first unescaped `>`; a newline or an unescaped
///   `<` before that rejects the destination. The returned string excludes
///   the angle brackets.
/// * bare — runs until whitespace, an ASCII control byte, or a `)` that is
///   not balanced by an earlier `(`. Must be non-empty with balanced
///   parentheses.
///
/// In both forms a backslash skips the byte that follows it. Returns the
/// raw destination text and the byte position just past it.
fn parse_link_destination(text: &str, start: usize) -> Option<(String, usize)> {
    let bytes = text.as_bytes();
    let first = *bytes.get(start)?;

    if first == b'<' {
        let body_start = start + 1;
        let mut cursor = body_start;
        loop {
            match bytes.get(cursor) {
                Some(b'>') => {
                    return Some((text[body_start..cursor].to_string(), cursor + 1));
                }
                None | Some(b'\n' | b'<') => return None,
                Some(b'\\') if cursor + 1 < bytes.len() => cursor += 2,
                Some(_) => cursor += 1,
            }
        }
    }

    let mut cursor = start;
    let mut open_parens: i32 = 0;
    while let Some(&byte) = bytes.get(cursor) {
        match byte {
            b'\\' if cursor + 1 < bytes.len() => cursor += 2,
            b'(' => {
                open_parens += 1;
                cursor += 1;
            }
            // An unbalanced `)` belongs to the enclosing link suffix.
            b')' if open_parens == 0 => break,
            b')' => {
                open_parens -= 1;
                cursor += 1;
            }
            b' ' | b'\t' | b'\n' | 0x7f => break,
            control if control < 0x20 => break,
            _ => cursor += 1,
        }
    }

    if cursor == start || open_parens != 0 {
        None
    } else {
        Some((text[start..cursor].to_string(), cursor))
    }
}
2275
/// Parse a link title starting at byte `start` of `text`.
///
/// The title is delimited by `"…"`, `'…'` or `(…)`. A backslash skips the
/// byte that follows it, so an escaped closing delimiter does not end the
/// title. Returns the raw title text (delimiters excluded, escapes left
/// as written) and the byte position just past the closing delimiter.
///
/// Returns `None` when `start` is out of range, when `text[start]` is not
/// one of the three opening delimiters, or when no closing delimiter is
/// found.
fn parse_link_title(text: &str, start: usize) -> Option<(String, usize)> {
    let bytes = text.as_bytes();
    // Checked lookup: an out-of-range `start` is "no title", not a panic.
    let close = match *bytes.get(start)? {
        b'"' => b'"',
        b'\'' => b'\'',
        b'(' => b')',
        _ => return None,
    };
    let begin = start + 1;
    let mut p = begin;
    while p < bytes.len() {
        let b = bytes[p];
        if b == b'\\' && p + 1 < bytes.len() {
            // Skip the escaped byte so it cannot terminate the title.
            p += 2;
            continue;
        }
        if b == close {
            return Some((text[begin..p].to_string(), p + 1));
        }
        p += 1;
    }
    None
}
2301
2302/// Try to parse `[label]` after a `]`. Returns `(suffix_end, label_raw)`.
2303/// For the collapsed form `[]`, returns `None` here (handled separately
2304/// by `collapsed_marker_span`).
2305fn try_full_reference_suffix(
2306 text: &str,
2307 pos: usize,
2308 allow_spaced: bool,
2309) -> Option<(usize, String)> {
2310 let bytes = text.as_bytes();
2311 let bracket_pos = if allow_spaced {
2312 skip_spaced_ref_gap(bytes, pos)
2313 } else {
2314 pos
2315 };
2316 if bracket_pos >= bytes.len() || bytes[bracket_pos] != b'[' {
2317 return None;
2318 }
2319 let label_start = bracket_pos + 1;
2320 let mut p = label_start;
2321 let mut escape_next = false;
2322 while p < bytes.len() {
2323 if escape_next {
2324 escape_next = false;
2325 p += 1;
2326 continue;
2327 }
2328 match bytes[p] {
2329 b'\\' => {
2330 escape_next = true;
2331 p += 1;
2332 }
2333 b']' => break,
2334 b'[' => return None,
2335 b'\n' => {
2336 p += 1;
2337 }
2338 _ => p += 1,
2339 }
2340 }
2341 if p >= bytes.len() || bytes[p] != b']' {
2342 return None;
2343 }
2344 let label = text[label_start..p].to_string();
2345 if label.is_empty() {
2346 return None;
2347 }
2348 Some((p + 1, label))
2349}
2350
2351/// True when `text[pos..]` opens with the collapsed `[]` marker. Under
2352/// `spaced_reference_links`, whitespace before the `[]` is permitted; the
2353/// returned `Some(end)` reports the byte position past the closing `]`.
2354fn collapsed_marker_span(text: &str, pos: usize, allow_spaced: bool) -> Option<usize> {
2355 let bytes = text.as_bytes();
2356 let bracket_pos = if allow_spaced {
2357 skip_spaced_ref_gap(bytes, pos)
2358 } else {
2359 pos
2360 };
2361 if bytes.get(bracket_pos) == Some(&b'[') && bytes.get(bracket_pos + 1) == Some(&b']') {
2362 Some(bracket_pos + 2)
2363 } else {
2364 None
2365 }
2366}
2367
/// Advance past the whitespace gap that `spaced_reference_links` allows
/// between a closing `]` and the following `[`/`[]`: any number of spaces
/// and tabs plus at most one LF. Returns the first position not skipped;
/// a `pos` at or beyond the end of `bytes` is returned unchanged.
///
/// Block parsing already guarantees a blank line cannot appear inside a
/// single inline-parse range, so a single newline is the upper bound.
fn skip_spaced_ref_gap(bytes: &[u8], pos: usize) -> usize {
    let mut newlines_left = 1u8;
    let tail = bytes.get(pos..).unwrap_or(&[]);
    let skipped = tail
        .iter()
        .take_while(|&&byte| match byte {
            b' ' | b'\t' => true,
            b'\n' if newlines_left > 0 => {
                newlines_left -= 1;
                true
            }
            _ => false,
        })
        .count();
    pos + skipped
}
2387
2388// ============================================================================
2389// Bracket plan — byte-position-keyed view of resolved brackets, consumed by
2390// the existing emission walk in `core::parse_inline_range_impl`.
2391// ============================================================================
2392
/// Disposition of a single bracket byte after [`process_brackets`].
///
/// Entries are produced by [`build_bracket_plan`] and retrieved by byte
/// position through [`BracketPlan::lookup`].
#[derive(Debug, Clone)]
pub enum BracketDispo {
    /// `[` or `![` of a resolved link/image. Emission emits the LINK/IMAGE
    /// node and skips past `suffix_end`.
    Open {
        // True when the opener is `![` (image) rather than `[` (link).
        is_image: bool,
        // Byte position just past the opener, where the link text begins.
        text_start: usize,
        // Byte position of the closing `]`, where the link text ends.
        text_end: usize,
        // Byte position just past the closing `]`.
        suffix_start: usize,
        // Byte position just past the suffix; equals `suffix_start` for
        // the shortcut reference form, which has no suffix.
        suffix_end: usize,
        // Which link form resolved (inline, full, collapsed or shortcut).
        kind: LinkKind,
    },
    /// Pandoc-only: `[` or `![` of a bracket-shape reference pattern
    /// whose label didn't resolve. Emission wraps `[start, end)` in an
    /// `UNRESOLVED_REFERENCE` node so downstream tools can attach
    /// behavior to the bracket-shape pattern. `text_start..text_end` is
    /// the inner text range (between the outer `[`/`![` and `]`).
    UnresolvedReference {
        // True when the opener is `![` rather than `[`.
        is_image: bool,
        // Byte position just past the opener.
        text_start: usize,
        // Byte position of the closing `]`.
        text_end: usize,
        // Byte position just past the whole pattern, including any
        // `[label]` / `[]` suffix.
        end: usize,
    },
    /// Bracket byte (one of `[`, `]`, or `!`) that fell through to literal
    /// text. Emission accumulates into the surrounding text run.
    Literal,
}
2421
/// A byte-keyed view of the IR's bracket resolutions.
///
/// Built by [`build_bracket_plan`]; maps a source byte position to the
/// [`BracketDispo`] recorded for the bracket byte at that position.
#[derive(Debug, Default, Clone)]
pub struct BracketPlan {
    // Source byte position -> disposition of the bracket byte found there.
    by_pos: BTreeMap<usize, BracketDispo>,
}

impl BracketPlan {
    /// Disposition recorded for the byte at `pos`, or `None` when no
    /// entry is keyed at that position.
    pub fn lookup(&self, pos: usize) -> Option<&BracketDispo> {
        self.by_pos.get(&pos)
    }

    /// True when the plan holds no entries at all.
    pub fn is_empty(&self) -> bool {
        self.by_pos.is_empty()
    }
}
2437
/// A standalone Pandoc inline construct recognised by `build_ir` and
/// dispatched directly from the emission walk. Carries the construct's
/// full source range so the emission walk can slice the content for the
/// existing `emit_*` helpers without re-running the recognition.
///
/// The range start is the key under which the entry is stored in
/// [`ConstructPlan`]; each variant's `end` is the `end` of the originating
/// `IrEvent::Construct` (see [`build_construct_plan`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ConstructDispo {
    /// `^[note text]` — emit via `emit_inline_footnote` after slicing
    /// the inner content.
    InlineFootnote { end: usize },
    /// `<span ...>...</span>` — emit via `emit_native_span` after
    /// re-parsing the open-tag attributes from the source range.
    NativeSpan { end: usize },
    /// `[^id]` — emit via `emit_footnote_reference` after extracting
    /// the label id from the source range.
    FootnoteReference { end: usize },
    /// `[@cite]` — emit via `emit_bracketed_citation` after slicing
    /// the inner content.
    BracketedCitation { end: usize },
    /// `@key` or `-@key` — emit via `emit_bare_citation` (or
    /// `emit_crossref` when `is_quarto_crossref_key` matches and
    /// `extensions.quarto_crossrefs` is enabled).
    BareCitation { end: usize },
    /// `[content]{attrs}` — emit via `emit_bracketed_span` after
    /// slicing the inner content and attribute string.
    BracketedSpan { end: usize },
    /// `[[url]]` / `[[url|title]]` (or image variant `![[...]]`) —
    /// emit via `emit_wikilink` after re-locating the pipe within the
    /// source range.
    WikiLink { end: usize },
}
2468
/// A byte-keyed view of the IR's standalone Pandoc constructs that the
/// emission walk consumes directly: inline footnotes, native spans,
/// footnote references, bracketed citations, bare citations,
/// bracketed spans, and wikilinks. Recognition is authoritative in
/// `build_ir` under `Dialect::Pandoc`; the dispatcher's legacy branches
/// for these constructs (`^[`, `<span>`, `[^id]`, `[@cite]`, `@cite` /
/// `-@cite`, `[text]{attrs}`) are gated to `Dialect::CommonMark` only and
/// only fire when the relevant extension is explicitly enabled.
#[derive(Debug, Default, Clone)]
pub struct ConstructPlan {
    // Construct start byte -> disposition carrying the construct's end.
    by_pos: BTreeMap<usize, ConstructDispo>,
}

impl ConstructPlan {
    /// Construct recorded as starting at byte `pos`, or `None` when no
    /// entry is keyed at that position.
    pub fn lookup(&self, pos: usize) -> Option<&ConstructDispo> {
        self.by_pos.get(&pos)
    }

    /// True when the plan holds no entries at all.
    pub fn is_empty(&self) -> bool {
        self.by_pos.is_empty()
    }
}
2491
2492/// Build a [`ConstructPlan`] from the resolved IR. Each
2493/// `Construct { kind: InlineFootnote | NativeSpan, .. }` becomes one
2494/// entry keyed at its start byte.
2495pub fn build_construct_plan(events: &[IrEvent]) -> ConstructPlan {
2496 let mut by_pos: BTreeMap<usize, ConstructDispo> = BTreeMap::new();
2497 for ev in events {
2498 if let IrEvent::Construct { start, end, kind } = ev {
2499 match kind {
2500 ConstructKind::InlineFootnote => {
2501 by_pos.insert(*start, ConstructDispo::InlineFootnote { end: *end });
2502 }
2503 ConstructKind::NativeSpan => {
2504 by_pos.insert(*start, ConstructDispo::NativeSpan { end: *end });
2505 }
2506 ConstructKind::FootnoteReference => {
2507 by_pos.insert(*start, ConstructDispo::FootnoteReference { end: *end });
2508 }
2509 ConstructKind::BracketedCitation => {
2510 by_pos.insert(*start, ConstructDispo::BracketedCitation { end: *end });
2511 }
2512 ConstructKind::BareCitation => {
2513 by_pos.insert(*start, ConstructDispo::BareCitation { end: *end });
2514 }
2515 ConstructKind::BracketedSpan => {
2516 by_pos.insert(*start, ConstructDispo::BracketedSpan { end: *end });
2517 }
2518 ConstructKind::WikiLink => {
2519 by_pos.insert(*start, ConstructDispo::WikiLink { end: *end });
2520 }
2521 _ => {}
2522 }
2523 }
2524 }
2525 ConstructPlan { by_pos }
2526}
2527
2528/// Build a [`BracketPlan`] from the resolved IR. Each `OpenBracket`
2529/// resolution becomes an [`BracketDispo::Open`] keyed at the opener's
2530/// start byte. Unresolved openers and unmatched closers become
2531/// `BracketDispo::Literal` so the emission path can recognise them
2532/// without re-parsing.
2533pub fn build_bracket_plan(events: &[IrEvent]) -> BracketPlan {
2534 let mut by_pos: BTreeMap<usize, BracketDispo> = BTreeMap::new();
2535 for ev in events {
2536 match ev {
2537 IrEvent::OpenBracket {
2538 start,
2539 is_image,
2540 resolution: Some(res),
2541 ..
2542 } => {
2543 by_pos.insert(
2544 *start,
2545 BracketDispo::Open {
2546 is_image: *is_image,
2547 text_start: res.text_start,
2548 text_end: res.text_end,
2549 suffix_start: res.suffix_start,
2550 suffix_end: res.suffix_end,
2551 kind: res.kind.clone(),
2552 },
2553 );
2554 }
2555 IrEvent::OpenBracket {
2556 start,
2557 end,
2558 is_image,
2559 resolution: None,
2560 unresolved_ref: Some(shape),
2561 ..
2562 } => {
2563 by_pos.insert(
2564 *start,
2565 BracketDispo::UnresolvedReference {
2566 is_image: *is_image,
2567 text_start: *end,
2568 text_end: shape.text_end,
2569 end: shape.end,
2570 },
2571 );
2572 }
2573 IrEvent::OpenBracket {
2574 start,
2575 is_image,
2576 resolution: None,
2577 unresolved_ref: None,
2578 ..
2579 } => {
2580 let len = if *is_image { 2 } else { 1 };
2581 for off in 0..len {
2582 by_pos.insert(*start + off, BracketDispo::Literal);
2583 }
2584 }
2585 IrEvent::CloseBracket {
2586 pos,
2587 matched: false,
2588 } => {
2589 by_pos.insert(*pos, BracketDispo::Literal);
2590 }
2591 _ => {}
2592 }
2593 }
2594 BracketPlan { by_pos }
2595}
2596
/// One-shot helper: build the IR, run all passes, and return the
/// bundled [`InlinePlans`] (emphasis dispositions, bracket resolutions,
/// and standalone Pandoc constructs) — packaged together so the inline
/// emission path can consume them in one go for either dialect.
///
/// `start` and `end` are forwarded to `build_ir_into` together with
/// `text`; `config` supplies the dialect, the refdef labels and the
/// `spaced_reference_links` extension flag used by the passes.
///
/// Pass ordering follows the CommonMark §6.3 reference impl: bracket
/// resolution runs first, then emphasis is processed *scoped per resolved
/// bracket pair's inner event range*, then once more on the residual
/// top-level events. This prevents emphasis pairs from forming across a
/// link's bracket boundary, which the previous "all-emphasis-then-all-
/// brackets" order got wrong (e.g. spec example #473).
pub fn build_full_plans(
    text: &str,
    start: usize,
    end: usize,
    config: &ParserOptions,
) -> InlinePlans {
    // Check out the scratch bundle and reset its three buffers before use.
    let mut scratch = ScratchEvents::checkout();
    let bundle = scratch.inner.as_mut().unwrap();
    bundle.events.clear();
    bundle.bracket_pairs.clear();
    bundle.excluded.clear();

    build_ir_into(text, start, end, config, &mut bundle.events);
    // §6.3 bracket resolution runs for both dialects, and reference
    // forms (full / collapsed / shortcut) are checked against
    // `config.refdef_labels` under either one. Under CommonMark a
    // resolved link additionally deactivates earlier link openers (the
    // §6.3 link-in-link rule). Under Pandoc that deactivation pass is
    // skipped — pandoc-native is outer-wins for nested links and the
    // dispatcher's `suppress_inner_links` flag suppresses inner LINK
    // emission during LINK-text recursion — and reference-shaped pairs
    // whose label does not resolve are tagged as unresolved references.
    process_brackets(
        &mut bundle.events,
        text,
        config.refdef_labels.as_ref(),
        config.dialect,
        config.extensions.spaced_reference_links,
    );

    // Scoped emphasis pass per resolved bracket pair, innermost first.
    // We collect (open_idx, close_idx) pairs of resolved brackets and run
    // emphasis only over the events strictly between them. Innermost-first
    // ordering matters: an outer link wraps emphasis that wraps an inner
    // link, and the inner link's inner range must be paired before the
    // outer's inner range so the top-level pass sees consistent state.
    // Include both resolved-link bracket pairs and Pandoc unresolved-
    // reference bracket pairs in the scoping set. The latter wrap into
    // an `UNRESOLVED_REFERENCE` CST node, which is just as much a tree
    // boundary for emphasis as a resolved `LINK` — emphasis must not
    // pair across the wrapper's brackets, otherwise the emission walk
    // produces a non-tree-shaped CST.
    bundle.bracket_pairs.extend(
        bundle
            .events
            .iter()
            .enumerate()
            .filter_map(|(i, ev)| match ev {
                IrEvent::OpenBracket {
                    resolution: Some(res),
                    ..
                } => Some((i, res.close_event as usize)),
                IrEvent::OpenBracket {
                    resolution: None,
                    unresolved_ref: Some(shape),
                    ..
                } => Some((i, shape.close_event as usize)),
                _ => None,
            }),
    );
    // Innermost-first: sort by close_idx ascending, then open_idx descending.
    bundle
        .bracket_pairs
        .sort_by(|a, b| a.1.cmp(&b.1).then(b.0.cmp(&a.0)));
    // Iterate pairs by index so we can hold &mut bundle.events while
    // reading bundle.bracket_pairs (split borrow on disjoint fields).
    for i in 0..bundle.bracket_pairs.len() {
        let (open_idx, close_idx) = bundle.bracket_pairs[i];
        process_emphasis_in_range(&mut bundle.events, open_idx + 1, close_idx, config.dialect);
    }

    // Pandoc-only degrade pass for unresolved bracket-shape patterns
    // whose interior left any delim-run byte unmatched after the scoped
    // emphasis pass. Pandoc-native degrades such brackets to literal `[`
    // / `]` text — the user's intent was clearly not a reference. The
    // bracket_pairs entry stays so the inner delims remain in the
    // top-level exclusion mask (otherwise they'd re-enter pairing and
    // could form Emph spans with delims outside, which pandoc never
    // does — see the bug_2_emphasis_crosses_brackets_pandoc fixture).
    // Flipping `unresolved_ref` to `None` makes `build_bracket_plan`
    // emit `BracketDispo::Literal` for the bracket bytes; flipping
    // `CloseBracket.matched` to `false` does the same for the `]`.
    for i in 0..bundle.bracket_pairs.len() {
        let (open_idx, close_idx) = bundle.bracket_pairs[i];
        // Resolved links are never degraded; only unresolved-reference
        // openers are candidates.
        let is_unresolved = matches!(
            &bundle.events[open_idx],
            IrEvent::OpenBracket {
                resolution: None,
                unresolved_ref: Some(_),
                ..
            }
        );
        if !is_unresolved {
            continue;
        }
        if !range_has_unmatched_delim_bytes(&bundle.events, open_idx + 1, close_idx) {
            continue;
        }
        if let IrEvent::OpenBracket { unresolved_ref, .. } = &mut bundle.events[open_idx] {
            *unresolved_ref = None;
        }
        if let IrEvent::CloseBracket { matched, .. } = &mut bundle.events[close_idx] {
            *matched = false;
        }
    }

    // Top-level emphasis pass: handles delim runs that fall outside any
    // collected bracket pair.
    let len = bundle.events.len();
    if bundle.bracket_pairs.is_empty() {
        // Fast path: no bracket pairs means no exclusion mask needed —
        // skip the resize-and-fill pass entirely. Common for prose
        // paragraphs without inline links.
        process_emphasis_in_range_filtered(&mut bundle.events, 0, len, None, config.dialect);
    } else {
        // Build exclusion bitmap: every event index strictly between a
        // collected pair's opener and closer is excluded from the
        // top-level pass. Implements the §6.3 boundary rule: emphasis at
        // the top level must not pair across a link's brackets.
        bundle.excluded.resize(len, false);
        for &(open_idx, close_idx) in &bundle.bracket_pairs {
            for slot in bundle
                .excluded
                .iter_mut()
                .take(close_idx)
                .skip(open_idx + 1)
            {
                *slot = true;
            }
        }
        process_emphasis_in_range_filtered(
            &mut bundle.events,
            0,
            len,
            Some(&bundle.excluded),
            config.dialect,
        );
    }

    // Snapshot the annotated events into the three plans handed to emission.
    InlinePlans {
        emphasis: build_emphasis_plan(&bundle.events),
        brackets: build_bracket_plan(&bundle.events),
        constructs: build_construct_plan(&bundle.events),
    }
}
2751
2752/// Returns true if any [`IrEvent::DelimRun`] in the event range
2753/// `[lo, hi)` has byte coverage from its `matches` vec that is less
2754/// than the run length — i.e. at least one byte of the run failed to
2755/// pair as emphasis. Used by the Pandoc unresolved-reference degrade
2756/// pass in [`build_full_plans`].
2757///
2758/// Delim runs whose flanking rules forbid both opening *and* closing
2759/// (e.g. intraword `_` inside `foo_bar`) are skipped: those bytes were
2760/// never a pairing candidate, so an "unmatched" count for them isn't
2761/// evidence of a failed emphasis attempt. Without this exclusion every
2762/// URL or identifier with an underscore inside an unresolved bracket
2763/// pair would spuriously degrade the bracket-shape to literal text.
2764fn range_has_unmatched_delim_bytes(events: &[IrEvent], lo: usize, hi: usize) -> bool {
2765 let hi = hi.min(events.len());
2766 for ev in &events[lo..hi] {
2767 if let IrEvent::DelimRun {
2768 start,
2769 end,
2770 matches,
2771 can_open,
2772 can_close,
2773 ..
2774 } = ev
2775 {
2776 if !can_open && !can_close {
2777 continue;
2778 }
2779 let total = end - start;
2780 let matched: usize = matches.iter().map(|m| m.len as usize).sum();
2781 if matched < total {
2782 return true;
2783 }
2784 }
2785 }
2786 false
2787}
2788
2789/// Thread-local pool of scratch buffers used by [`build_full_plans`].
2790///
2791/// `build_full_plans` checks out one bundle for the duration of the call
2792/// and returns it on drop so the next call (or a recursive nested call
2793/// from an inline emitter) reuses the allocations. The pool is
2794/// per-thread — the parser is single-threaded — and bounded so a
2795/// long-running editor session can't accumulate stale capacity.
2796struct ScratchEvents {
2797 inner: Option<ScratchBundle>,
2798}
2799
2800#[derive(Default)]
2801struct ScratchBundle {
2802 events: Vec<IrEvent>,
2803 bracket_pairs: Vec<(usize, usize)>,
2804 excluded: Vec<bool>,
2805}
2806
2807thread_local! {
2808 static IR_EVENT_POOL: std::cell::RefCell<Vec<ScratchBundle>> =
2809 const { std::cell::RefCell::new(Vec::new()) };
2810}
2811
2812impl ScratchEvents {
2813 fn checkout() -> Self {
2814 let bundle = IR_EVENT_POOL
2815 .with(|p| p.borrow_mut().pop())
2816 .unwrap_or_default();
2817 Self {
2818 inner: Some(bundle),
2819 }
2820 }
2821}
2822
2823impl Drop for ScratchEvents {
2824 fn drop(&mut self) {
2825 if let Some(mut bundle) = self.inner.take() {
2826 bundle.events.clear();
2827 bundle.bracket_pairs.clear();
2828 bundle.excluded.clear();
2829 // Cap pool depth at 8 (deepest realistic nested-link recursion)
2830 // and drop any bundle whose `events` grew past 8K (a single
2831 // pathological paragraph shouldn't pin a huge allocation
2832 // forever).
2833 if bundle.events.capacity() <= 8192 {
2834 IR_EVENT_POOL.with(|p| {
2835 let mut pool = p.borrow_mut();
2836 if pool.len() < 8 {
2837 pool.push(bundle);
2838 }
2839 });
2840 }
2841 }
2842 }
2843}
2844
2845/// Bundle of plans produced by [`build_full_plans`] and consumed by the
2846/// inline emission walk.
2847#[derive(Debug, Default, Clone)]
2848pub struct InlinePlans {
2849 pub emphasis: EmphasisPlan,
2850 pub brackets: BracketPlan,
2851 pub constructs: ConstructPlan,
2852}
2853
2854/// Convert the IR's delim-run match decisions into an [`EmphasisPlan`],
2855/// preserving the byte-keyed disposition shape the existing emission walk
2856/// consumes.
2857///
2858/// Each match on a [`DelimRun`](IrEvent::DelimRun) produces one entry in
2859/// the plan: the opener side records `Open` with the partner's source
2860/// byte and length; the closer side records `Close`. Bytes within a run
2861/// that are *not* covered by any match get a `Literal` entry, which the
2862/// emission walk uses to coalesce unmatched delimiter bytes with
2863/// surrounding plain text.
2864pub fn build_emphasis_plan(events: &[IrEvent]) -> EmphasisPlan {
2865 let mut by_pos: BTreeMap<usize, DelimChar> = BTreeMap::new();
2866 for ev in events {
2867 if let IrEvent::DelimRun {
2868 start,
2869 end,
2870 matches,
2871 ..
2872 } = ev
2873 {
2874 for m in matches {
2875 let pos = *start + m.offset_in_run as usize;
2876 let partner_run_start = match &events[m.partner_event as usize] {
2877 IrEvent::DelimRun { start: ps, .. } => *ps,
2878 _ => continue,
2879 };
2880 let partner_pos = partner_run_start + m.partner_offset as usize;
2881 if m.is_opener {
2882 by_pos.insert(
2883 pos,
2884 DelimChar::Open {
2885 len: m.len,
2886 partner: partner_pos,
2887 partner_len: m.len,
2888 kind: m.kind,
2889 },
2890 );
2891 } else {
2892 by_pos.insert(pos, DelimChar::Close);
2893 }
2894 }
2895 // Any remaining bytes (not covered by a match) are literal.
2896 for pos in *start..*end {
2897 by_pos.entry(pos).or_insert(DelimChar::Literal);
2898 }
2899 }
2900 }
2901 EmphasisPlan::from_dispositions(by_pos)
2902}
2903
#[cfg(test)]
mod tests {
    use super::*;
    use crate::options::Flavor;
    use crate::parser::inlines::inline_ir::DelimChar;
    use std::sync::Arc;

    /// Parser options whose dialect and extension set are derived from
    /// `flavor`; everything else is left at its empty/default value.
    fn opts_for(flavor: Flavor) -> ParserOptions {
        ParserOptions {
            flavor,
            dialect: crate::options::Dialect::for_flavor(flavor),
            extensions: crate::options::Extensions::for_flavor(flavor),
            pandoc_compat: crate::options::PandocCompat::default(),
            crossref_prefixes: Vec::new(),
            refdef_labels: None,
        }
    }

    /// Options for the CommonMark flavor.
    fn cm_opts() -> ParserOptions {
        opts_for(Flavor::CommonMark)
    }

    /// Options for the Pandoc flavor.
    fn pandoc_opts() -> ParserOptions {
        opts_for(Flavor::Pandoc)
    }

    /// Builds a refdef map containing exactly the given labels.
    fn refdefs<I: IntoIterator<Item = &'static str>>(labels: I) -> RefdefMap {
        Arc::new(labels.into_iter().map(String::from).collect())
    }

    /// Returns the first `OpenBracket` event that starts at byte `at`,
    /// panicking if the IR has none.
    fn first_open_bracket_at(ir: &[IrEvent], at: usize) -> &IrEvent {
        ir.iter()
            .find(|ev| matches!(ev, IrEvent::OpenBracket { start, .. } if *start == at))
            .unwrap()
    }

    #[test]
    fn ir_event_range_covers_all_variants() {
        let text = IrEvent::Text { start: 0, end: 5 };
        let close = IrEvent::CloseBracket {
            pos: 7,
            matched: false,
        };
        let open = IrEvent::OpenBracket {
            start: 1,
            end: 3,
            is_image: true,
            active: true,
            resolution: None,
            unresolved_ref: None,
        };

        assert_eq!(text.range(), (0, 5));
        // A close bracket is always exactly one byte wide.
        assert_eq!(close.range(), (7, 8));
        assert_eq!(open.range(), (1, 3));
    }

    #[test]
    fn scan_records_text_and_delim_run() {
        let opts = cm_opts();
        let source = "foo *bar*";
        let ir = build_ir(source, 0, source.len(), &opts);

        // Expected event sequence:
        //   Text "foo " · DelimRun "*" · Text "bar" · DelimRun "*"
        assert!(matches!(ir[0], IrEvent::Text { start: 0, end: 4 }));
        assert!(matches!(
            ir[1],
            IrEvent::DelimRun {
                ch: b'*',
                start: 4,
                end: 5,
                ..
            }
        ));
        assert!(matches!(ir[2], IrEvent::Text { start: 5, end: 8 }));
        assert!(matches!(
            ir[3],
            IrEvent::DelimRun {
                ch: b'*',
                start: 8,
                end: 9,
                ..
            }
        ));
    }

    #[test]
    fn scan_records_brackets() {
        let opts = cm_opts();
        let source = "[foo]";
        let ir = build_ir(source, 0, source.len(), &opts);

        assert!(matches!(
            ir[0],
            IrEvent::OpenBracket {
                start: 0,
                end: 1,
                is_image: false,
                ..
            }
        ));
        assert!(matches!(ir[1], IrEvent::Text { start: 1, end: 4 }));
        // The scan alone never marks a close bracket as matched.
        assert!(matches!(
            ir[2],
            IrEvent::CloseBracket {
                pos: 4,
                matched: false
            }
        ));
    }

    #[test]
    fn scan_records_image_bracket() {
        let opts = cm_opts();
        let source = "![alt]";
        let ir = build_ir(source, 0, source.len(), &opts);

        // `![` is a single two-byte opener flagged as an image.
        assert!(matches!(
            ir[0],
            IrEvent::OpenBracket {
                start: 0,
                end: 2,
                is_image: true,
                ..
            }
        ));
    }

    #[test]
    fn scan_handles_code_span_opacity() {
        let opts = cm_opts();
        let source = "a `*x*` b";
        let ir = build_ir(source, 0, source.len(), &opts);

        // The code span `*x*` must surface as one Construct event; its
        // asterisks must not leak out as delimiter runs.
        let delim_run_count = ir
            .iter()
            .filter(|ev| matches!(ev, IrEvent::DelimRun { .. }))
            .count();
        assert!(
            delim_run_count == 0,
            "code span content should not produce delim runs"
        );

        let has_code_span = ir.iter().any(|ev| {
            matches!(
                ev,
                IrEvent::Construct {
                    kind: ConstructKind::CodeSpan,
                    ..
                }
            )
        });
        assert!(has_code_span);
    }

    #[test]
    fn process_emphasis_simple_pair() {
        let opts = cm_opts();
        let source = "*foo*";
        let mut ir = build_ir(source, 0, source.len(), &opts);
        process_emphasis(&mut ir, opts.dialect);

        // The run at byte 0 is the opener and must carry one Emph match.
        let opener = ir
            .iter()
            .find(|ev| matches!(ev, IrEvent::DelimRun { start: 0, .. }))
            .unwrap();
        if let IrEvent::DelimRun { matches, .. } = opener {
            assert_eq!(matches.len(), 1);
            let only = &matches[0];
            assert!(only.is_opener);
            assert_eq!(only.kind, EmphasisKind::Emph);
        }
    }

    #[test]
    fn brackets_resolve_inline_link() {
        let opts = cm_opts();
        let source = "[foo](/url)";
        let mut ir = build_ir(source, 0, source.len(), &opts);
        process_brackets(&mut ir, source, None, opts.dialect, false);

        if let IrEvent::OpenBracket { resolution, .. } = first_open_bracket_at(&ir, 0) {
            let resolved = resolution.as_ref().expect("inline link resolved");
            assert!(matches!(resolved.kind, LinkKind::Inline { .. }));
            if let LinkKind::Inline { dest, .. } = &resolved.kind {
                assert_eq!(dest, "/url");
            }
        }
    }

    #[test]
    fn brackets_shortcut_resolves_only_with_refdef() {
        let opts = cm_opts();
        let source = "[foo]";
        let known_labels = refdefs(["foo"]);
        let mut ir = build_ir(source, 0, source.len(), &opts);
        process_brackets(&mut ir, source, Some(&known_labels), opts.dialect, false);

        if let IrEvent::OpenBracket { resolution, .. } = first_open_bracket_at(&ir, 0) {
            let resolved = resolution.as_ref().unwrap();
            assert!(matches!(resolved.kind, LinkKind::ShortcutReference));
        }
    }

    /// CMark example #523 mechanic: `[bar* baz]` has no refdef, so it
    /// must NOT resolve as a link — the brackets stay literal and the
    /// inner `*` remains available to the outer emphasis scanner.
    #[test]
    fn brackets_shortcut_falls_through_without_refdef() {
        let opts = cm_opts();
        let source = "[bar* baz]";
        let mut ir = build_ir(source, 0, source.len(), &opts);
        process_brackets(&mut ir, source, None, opts.dialect, false);

        if let IrEvent::OpenBracket { resolution, .. } = first_open_bracket_at(&ir, 0) {
            assert!(resolution.is_none(), "no refdef → bracket stays literal");
        }
    }

    /// Spec #473: `*[bar*](/url)`. The link `[bar*](/url)` resolves, and
    /// the outer `*...*` MUST NOT pair across the link's bracket
    /// boundary because the inner `*` belongs to the link text.
    #[test]
    fn full_plans_emphasis_does_not_cross_resolved_link_boundary() {
        let opts = cm_opts();
        let source = "*[bar*](/url)";
        let plans = build_full_plans(source, 0, source.len(), &opts);

        // Byte 0 has no closer outside the link, and the `*` at byte 5
        // sits inside the resolved link text, so byte 0 stays unpaired.
        let outer_star = plans.emphasis.lookup(0);
        assert!(
            matches!(outer_star, Some(DelimChar::Literal) | None),
            "outer `*` at byte 0 must not pair across link boundary, got {:?}",
            plans.emphasis.lookup(0)
        );

        // The link itself (opener at byte 1) must resolve.
        assert!(
            matches!(plans.brackets.lookup(1), Some(BracketDispo::Open { .. })),
            "link [bar*](/url) must resolve at byte 1"
        );
    }

    /// Bug #2 (a): an unresolved Pandoc bracket shape with an unmatched
    /// delimiter inside its text degrades to literal `[`/`]`, and the
    /// outer emphasis pair across the now-literal brackets must form.
    #[test]
    fn full_plans_unresolved_bracket_degrades_when_inner_delim_unmatched() {
        let opts = pandoc_opts();
        let source = "*foo [bar*] baz*";
        let plans = build_full_plans(source, 0, source.len(), &opts);

        assert!(
            matches!(plans.brackets.lookup(5), Some(BracketDispo::Literal) | None),
            "degraded `[` at byte 5 must be Literal/None, got {:?}",
            plans.brackets.lookup(5)
        );
        assert!(
            matches!(plans.emphasis.lookup(0), Some(DelimChar::Open { .. })),
            "outer `*` at byte 0 must open Emph after degrade, got {:?}",
            plans.emphasis.lookup(0)
        );
    }

    /// An intraword `_` (e.g. inside a URL such as
    /// `hyperparameter_optimization`) is not flanking: `can_open` and
    /// `can_close` are both false, so it can never pair as emphasis.
    /// The degrade pass must not read such runs as failed emphasis
    /// attempts and demote the surrounding bracket shape to literal
    /// text; otherwise every URL/identifier inside an unresolved
    /// reference round-trips through `\[` / `\]` escapes under
    /// `tex_math_single_backslash` and reparses as display math.
    #[test]
    fn full_plans_unresolved_bracket_keeps_wrapper_with_intraword_underscore() {
        let opts = pandoc_opts();
        let source = "[foo_bar more]";
        let plans = build_full_plans(source, 0, source.len(), &opts);

        assert!(
            matches!(
                plans.brackets.lookup(0),
                Some(BracketDispo::UnresolvedReference { .. })
            ),
            "wrapper must be preserved across intraword `_`, got {:?}",
            plans.brackets.lookup(0)
        );
    }

    /// Bug #2 (b): an unresolved Pandoc bracket whose interior emphasis
    /// pairs cleanly keeps its wrapper (linter/LSP hook).
    #[test]
    fn full_plans_unresolved_bracket_keeps_wrapper_when_inner_paired() {
        let opts = pandoc_opts();
        let source = "[foo *bar*]";
        let plans = build_full_plans(source, 0, source.len(), &opts);

        assert!(
            matches!(
                plans.brackets.lookup(0),
                Some(BracketDispo::UnresolvedReference { .. })
            ),
            "wrapper must be preserved when inner emph pairs, got {:?}",
            plans.brackets.lookup(0)
        );
    }

    /// Spec #533: `[foo *bar [baz][ref]*][ref]` with `[ref]: /uri`.
    /// The inner `[baz][ref]` resolves as a link; the §6.3 link-in-link
    /// rule deactivates the outer `[foo ...][ref]`, which falls through
    /// to literal brackets. Emphasis `*bar [baz][ref]*` wraps the inner
    /// link.
    #[test]
    fn full_plans_link_in_link_suppression_for_reference_links() {
        let source = "[foo *bar [baz][ref]*][ref]";
        let labels: HashSet<String> = ["ref".to_string()].into_iter().collect();
        let mut opts_with_refs = cm_opts();
        opts_with_refs.refdef_labels = Some(Arc::new(labels));
        let plans = build_full_plans(source, 0, source.len(), &opts_with_refs);

        // Byte 10: opener of the inner `[baz][ref]` — must resolve.
        assert!(
            matches!(plans.brackets.lookup(10), Some(BracketDispo::Open { .. })),
            "inner [baz][ref] must resolve at byte 10, got {:?}",
            plans.brackets.lookup(10)
        );
        // Byte 0: opener of the outer `[foo ...][ref]` — must NOT
        // resolve (link-in-link suppression).
        assert!(
            matches!(plans.brackets.lookup(0), Some(BracketDispo::Literal) | None),
            "outer [foo ...][ref] must fall through to literal at byte 0, got {:?}",
            plans.brackets.lookup(0)
        );
        // Byte 22: the `[ref]` trailing the outer `]` is a standalone
        // shortcut reference and must resolve.
        assert!(
            matches!(plans.brackets.lookup(22), Some(BracketDispo::Open { .. })),
            "trailing [ref] must resolve at byte 22, got {:?}",
            plans.brackets.lookup(22)
        );
        // Bytes 5 and 20: the `*...*` pair is formed by the scoped
        // emphasis pass over the (deactivated) outer bracket's interior.
        assert!(
            matches!(plans.emphasis.lookup(5), Some(DelimChar::Open { .. })),
            "emphasis opener at byte 5 must pair, got {:?}",
            plans.emphasis.lookup(5)
        );
    }
}