mdwright_document/
ir.rs

1//! Parsed-document intermediate representation.
2//!
3//! The IR is a curated, opinionated view of a Markdown document, built
4//! once at parse time and consumed by lint rules through the public
5//! [`Document`](crate::Document) façade. It hides two things from rule
6//! authors:
7//!
8//! - The pulldown-cmark event stream and its peculiarities (Text-event
9//!   byte ranges that omit preceding `\\` escapes; tight-list items
10//!   that bypass the `Paragraph` tag; container ranges that retain
11//!   blockquote markers on inner lines).
12//! - The post-parse work needed to recover information the parser
13//!   doesn't surface directly: link-reference definitions, code-block
14//!   info strings, list-marker bytes.
15//!
16//! The data-carrier types ([`TextSlice`], [`InlineCode`], [`Heading`],
17//! [`ListGroup`], etc.) are also the public types returned by
18//! `Document`'s accessors. Their fields are public because they are value
19//! objects, not abstractions, and information-hiding on a position
20//! record buys nothing.
21
22use std::ops::Range;
23use std::sync::OnceLock;
24
25use pulldown_cmark::{CodeBlockKind, Event, Tag, TagEnd};
26use regex::Regex;
27
28use crate::format_facts::FormatFacts;
29use crate::gfm::{AutolinkFact, collect_autolinks};
30use crate::line_index::LineIndex;
31use crate::parse;
32use crate::refs::{ReferenceTable, build_reference_table};
33use crate::source::{CanonicalSource, Source};
34use crate::tree::TreeBuilder;
35use crate::util::regex::compile_static;
36use crate::{ParseError, ParseOptions};
37use mdwright_math::{MathError, MathRegion, scan_math_regions};
38
/// An owned copy of a span of source text plus its absolute byte
/// position. The minimal record every rule needs to emit a diagnostic.
#[derive(Clone, Debug)]
pub struct TextSlice {
    /// The span's text, copied out of the source (not a borrow).
    pub text: String,
    /// Byte offset in the source at which `text` begins.
    pub byte_offset: usize,
    /// Byte range in the source the slice was taken from. For prose
    /// chunks this is widened by one byte to re-include a `\` escape
    /// that sits immediately before the parser's text range.
    pub raw_range: Range<usize>,
}
47
/// One inline code span. `text` excludes the surrounding backticks;
/// `raw_range` covers them.
///
/// When the span has no leading or trailing backtick run, or consists
/// only of backticks, nothing is stripped and `text` equals the raw
/// span.
#[derive(Clone, Debug)]
pub struct InlineCode {
    /// Span content between the backtick delimiters.
    pub text: String,
    /// Byte offset in the source at which `text` begins.
    pub byte_offset: usize,
    /// Byte range of the whole span, delimiters included.
    pub raw_range: Range<usize>,
}
56
/// One fenced or indented code block.
///
/// `text` is the body excluding fence lines; `raw_range` covers the
/// whole block including fences. `info` is the fence info string
/// (the language tag); empty for indented blocks.
///
/// NOTE(review): the builder in this file currently copies the whole
/// `raw_range` slice into `text` and sets `byte_offset` to the block
/// start, so fence lines appear to be included in `text`. Confirm
/// which behaviour is intended and align the code or this contract.
#[derive(Clone, Debug)]
pub struct CodeBlock {
    /// Text recorded for the block (see the review note above).
    pub text: String,
    /// Byte offset in the source at which `text` begins.
    pub byte_offset: usize,
    /// Byte range from the block's start event to its end event.
    pub raw_range: Range<usize>,
    /// Info string reported by the parser for a fenced block; empty
    /// for indented blocks.
    pub info: String,
    /// `true` for fenced blocks, `false` for indented ones.
    pub fenced: bool,
}
70
/// One HTML block (`CommonMark` §4.6).
///
/// One entry is recorded for every `Event::Html` the parser emits, so
/// a single source-level HTML block may map to several entries if the
/// parser splits it across events (presumably per line; verify against
/// the pulldown-cmark version in use).
#[derive(Clone, Debug)]
pub struct HtmlBlock {
    /// Source text covered by the event.
    pub text: String,
    /// Byte offset in the source at which `text` begins; equals
    /// `raw_range.start`.
    pub byte_offset: usize,
    /// Byte range of the event in the source.
    pub raw_range: Range<usize>,
}
78
/// One inline HTML tag (open, close, self-closing, comment, etc.)
/// embedded in a paragraph. One entry per `Event::InlineHtml`.
#[derive(Clone, Debug)]
pub struct InlineHtml {
    /// Source text of the tag.
    pub text: String,
    /// Byte offset in the source at which `text` begins; equals
    /// `raw_range.start`.
    pub byte_offset: usize,
    /// Byte range of the tag in the source.
    pub raw_range: Range<usize>,
}
87
/// One ATX or setext heading. `text` is the trimmed text content
/// (`#` markers and trailing whitespace stripped); `raw_range` covers
/// the whole heading line(s).
#[derive(Clone, Debug)]
pub struct Heading {
    /// Heading text after trimming.
    pub text: String,
    /// Heading start plus the offset reported by the trimming step,
    /// i.e. where `text` begins in the source.
    pub byte_offset: usize,
    /// From the start of the heading's opening event to the end of its
    /// closing event.
    pub raw_range: Range<usize>,
    /// 1 through 6 for `H1`..`H6`.
    pub level: u32,
}
99
/// A contiguous list at one indentation depth. Nested lists are
/// distinct `ListGroup` entries.
///
/// Groups are recorded when a list closes, so a nested list is
/// recorded before the list that contains it.
#[derive(Clone, Debug)]
pub struct ListGroup {
    /// From the start of the list's opening event to the end of its
    /// closing event.
    pub raw_range: Range<usize>,
    /// `true` when the parser reported a start number for the list.
    pub ordered: bool,
    /// Direct items of this list, in source order.
    pub items: Vec<ListItem>,
}
108
/// One item within a [`ListGroup`].
#[derive(Clone, Debug)]
pub struct ListItem {
    /// Byte range the parser reported for the item's opening event.
    pub raw_range: Range<usize>,
    /// Byte at the start of the marker (`-`, `*`, `+`, or `'0'..='9'`).
    /// For ordered lists this is the first digit of the index.
    /// Falls back to `b'-'` when no marker byte of the expected class
    /// is found inside `raw_range`.
    pub marker_byte: u8,
}
117
/// Frontmatter at the document head. Carries the raw slice plus a
/// tag for which delimiter the source used so the formatter can emit
/// the same opening and closing markers.
#[derive(Clone, Debug)]
pub struct Frontmatter {
    /// The frontmatter text and its position in the source.
    pub slice: TextSlice,
    /// Which fence style delimited the frontmatter.
    pub delimiter: FrontmatterDelimiter,
}
126
/// Frontmatter fence style. `Yaml` uses `---` open and `---`/`...`
/// close; `Toml` uses `+++` for both.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum FrontmatterDelimiter {
    /// `---` opening fence; closed by `---` or `...`.
    Yaml,
    /// `+++` opening and closing fence.
    Toml,
}
134
/// One link reference definition (`[label]: dest`).
///
/// The lint-rule surface produced by [`crate::Document::link_defs`].
///
/// Pulldown-cmark does not emit definition events, so the document
/// crate owns a reference-definition scan alongside the event walk.
#[derive(Clone, Debug)]
pub struct LinkDef<'a> {
    /// The text between `[` and `]`.
    pub label: &'a str,
    /// The link destination following the `:`.
    pub dest: &'a str,
    /// Optional title from `"…"`, `'…'`, or `(…)` after the
    /// destination. Surrounding quotes / parens are excluded.
    pub title: Option<&'a str>,
    /// Byte range of the definition in the source.
    pub raw_range: Range<usize>,
}
150
/// One lint suppression directive parsed from a Markdown HTML
/// comment.
///
/// The comment must live on its own source line with up to three spaces
/// of leading indentation.
///
/// Recognised forms:
///
/// - `<!-- mdwright: allow rule-a[, rule-b] -->`: silences the
///   listed rules on the *next block*.
/// - `<!-- mdwright: allow-next-line rule-a[, rule-b] -->`:
///   silences on the immediately following source line.
/// - `<!-- mdwright: disable [rule-a, ...] -->`: opens a region
///   ending at the matching `enable` (or end of file). An empty
///   rule list means every known rule.
/// - `<!-- mdwright: enable [rule-a, ...] -->`: closes a region.
/// - `<!-- mdwright: disable-all -->` / `<!-- mdwright: enable-all -->`:
///   convenience aliases for `disable` / `enable` with no names.
#[derive(Clone, Debug)]
pub struct Suppression {
    /// Which directive form the comment used.
    pub kind: SuppressionKind,
    /// Rule names parsed from the comment body. Empty for the bare
    /// `disable` / `enable` forms and for `disable-all` / `enable-all`;
    /// the suppression map expands empty to "every known rule".
    pub rules: Vec<String>,
    /// Byte range of the directive comment in the source.
    pub raw_range: Range<usize>,
}
180
/// One top-level block checkpoint in canonical source coordinates.
#[derive(Copy, Clone, Debug)]
pub struct BlockCheckpointFact {
    /// Byte offset of the checkpoint. Offsets that do not fit in a
    /// `u32` are clamped to `u32::MAX`.
    pub byte: u32,
    /// Opaque state tag: container depth in the high 32 bits and the
    /// running event count in the low 32 bits. Zero for the initial
    /// checkpoint at byte 0.
    pub parser_state: u64,
}
187
/// The directive form carried by a [`Suppression`].
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SuppressionKind {
    /// `allow` / `allow-next-line`: a one-shot suppression whose reach
    /// is given by `scope`.
    Allow { scope: AllowScope },
    /// `disable` / `disable-all`: opens a suppressed region.
    Disable,
    /// `enable` / `enable-all`: closes a suppressed region.
    Enable,
}
194
/// How far a [`SuppressionKind::Allow`] directive reaches.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum AllowScope {
    /// The next block (paragraph, heading, code block, list group).
    Block,
    /// The single source line immediately after the comment.
    NextLine,
}
202
/// The parsed document. Owned by [`Document`](crate::Document); fields
/// are `pub(crate)` so the façade can hand out borrowed views.
///
/// All byte positions are absolute offsets into the canonical source
/// string the document was parsed from (frontmatter included).
#[derive(Debug)]
pub(crate) struct Ir {
    /// Text runs outside code blocks, one per parser text event.
    pub(crate) prose_chunks: Vec<TextSlice>,
    /// Autolink facts collected from the event stream.
    pub(crate) autolinks: Vec<AutolinkFact>,
    /// Inline code spans.
    pub(crate) inline_codes: Vec<InlineCode>,
    /// Fenced and indented code blocks.
    pub(crate) code_blocks: Vec<CodeBlock>,
    /// HTML block entries, one per parser HTML event.
    pub(crate) html_blocks: Vec<HtmlBlock>,
    /// Inline HTML tags.
    pub(crate) inline_html: Vec<InlineHtml>,
    /// Headings, recorded as each one closes.
    pub(crate) headings: Vec<Heading>,
    /// Lists, recorded as each one closes (nested before enclosing).
    pub(crate) list_groups: Vec<ListGroup>,
    /// Link reference table built from the event stream.
    pub(crate) refs: ReferenceTable,
    /// Suppression directives scanned from the HTML blocks.
    pub(crate) suppressions: Vec<Suppression>,
    /// Frontmatter split off the head of the document, if any.
    pub(crate) frontmatter: Option<Frontmatter>,
    /// Math regions found by the math scanner.
    pub(crate) math_regions: Vec<MathRegion>,
    /// Errors reported by the math scanner.
    pub(crate) math_errors: Vec<MathError>,
    /// Line index over the canonical source.
    pub(crate) line_index: LineIndex,
    /// The finalized tree; retained only in test builds.
    #[cfg(test)]
    pub(crate) tree: crate::tree::Tree,
    /// Per-list tightness as reported by the tree, keyed by what is
    /// presumably the list's start byte (see
    /// `Tree::list_tightness_by_start`).
    pub(crate) list_tightness: Vec<(usize, bool)>,
    /// Byte ranges the tree reports as link-like.
    pub(crate) link_like_ranges: Vec<Range<usize>>,
    /// Top-level block checkpoints, strictly increasing by byte.
    pub(crate) block_checkpoints: Vec<BlockCheckpointFact>,
    /// Facts precomputed for the formatter.
    pub(crate) format_facts: FormatFacts,
}
228
impl Ir {
    /// Parse `src` into the IR.
    ///
    /// Steps, in order: canonicalise the source and split off any
    /// frontmatter; collect the parser events for the body once and
    /// rebase their ranges to absolute offsets; derive block
    /// checkpoints; run the flat [`Builder`] walk; scan math regions
    /// using the exclusion zones and transparent runs the walk
    /// produced; build the tree (which needs the math regions); then
    /// collect autolinks, the reference table, suppressions and the
    /// formatter facts.
    ///
    /// # Errors
    ///
    /// Returns the [`ParseError`] reported by event collection.
    #[tracing::instrument(level = "info", name = "Ir::parse", skip(src), fields(len = src.canonical().len()))]
    pub(crate) fn parse(src: &Source, opts: ParseOptions) -> Result<Self, ParseError> {
        let canonical_src = CanonicalSource::from_source(src);
        let source = canonical_src.as_str();
        let line_index = LineIndex::new(source);
        // `fm_end` is the byte at which the Markdown body starts; the
        // parser only ever sees `source[fm_end..]`.
        let (fm_end, frontmatter) = split_frontmatter(source);
        let body = canonical_src.trusted_subrange(fm_end..source.len());

        let mut builder = Builder {
            source,
            in_code_block: 0,
            heading_stack: Vec::new(),
            list_stack: Vec::new(),
            code_block_stack: Vec::new(),
            blockquote_stack: Vec::new(),
            blockquote_ranges: Vec::new(),
            list_item_ranges: Vec::new(),
            prose_chunks: Vec::new(),
            inline_codes: Vec::new(),
            code_blocks: Vec::new(),
            html_blocks: Vec::new(),
            inline_html: Vec::new(),
            headings: Vec::new(),
            list_groups: Vec::new(),
        };
        // Collect pulldown events once with absolute byte ranges. The
        // reference table is built from this event stream (pulldown's
        // own §4.7 resolution is authoritative); the flat IR is built
        // first (the math scanner depends on the exclusion zones it
        // collects), then math regions are computed, then the tree
        // is built. The tree builder needs math regions so it can
        // splice `NodeKind::Math` leaves at recognised positions.
        let events: Vec<(Event<'_>, Range<usize>)> = parse::collect_events_with_offsets(body, parse::options(opts))?
            .into_iter()
            .map(|(e, r)| {
                // Event ranges are relative to `body`; shift them by the
                // frontmatter length so they index into `source`.
                let abs = r.start.saturating_add(fm_end)..r.end.saturating_add(fm_end);
                (e, abs)
            })
            .collect();
        let block_checkpoints = build_block_checkpoints(source, &events);
        for (event, abs) in &events {
            builder.handle(event.clone(), abs.clone());
        }
        tracing::debug!(events = events.len(), "flat-IR walk complete");

        // Math regions: the scanner excludes code spans / blocks /
        // HTML blocks / inline HTML (regions where `\[` / `\(` / `$`
        // are not math). Transparent runs (blockquote `>` markers
        // and list-item continuation indents) let the recogniser
        // scan across container prefixes without those bytes leaking
        // into the math body.
        let transparent_runs = compute_transparent_runs(source, &builder.blockquote_ranges, &builder.list_item_ranges);
        let math_exclusions: Vec<Range<usize>> = builder
            .inline_codes
            .iter()
            .map(|c| c.raw_range.clone())
            .chain(builder.code_blocks.iter().map(|c| c.raw_range.clone()))
            .chain(builder.html_blocks.iter().map(|h| h.raw_range.clone()))
            .chain(builder.inline_html.iter().map(|h| h.raw_range.clone()))
            .collect();
        let (math_regions, math_errors) = scan_math_regions(
            source,
            &math_exclusions,
            &transparent_runs,
            opts.math().scanner_config(),
        );

        // Second pass over the same events, this time building the tree.
        let mut tree_builder = TreeBuilder::new(source, &math_regions);
        for (event, abs) in &events {
            tree_builder.handle(event, abs.clone());
        }
        tracing::debug!(nodes = tree_builder.arena_len(), "tree walk complete");

        let autolinks = collect_autolinks(source, &events, opts.extensions().gfm);
        // The reference-table builder takes events without their ranges.
        let bare_events: Vec<Event<'_>> = events.iter().map(|(e, _)| e.clone()).collect();
        let refs = build_reference_table(&bare_events, source);
        let suppressions = scan_suppressions(&builder.html_blocks);
        let tree = tree_builder.finalize(&refs);
        let list_tightness = tree.list_tightness_by_start();
        let link_like_ranges = tree.link_like_ranges();
        let format_facts = FormatFacts::from_parts(
            source,
            &events,
            &autolinks,
            &math_regions,
            &builder.code_blocks,
            &builder.html_blocks,
            &tree,
        );

        Ok(Self {
            prose_chunks: builder.prose_chunks,
            autolinks,
            inline_codes: builder.inline_codes,
            code_blocks: builder.code_blocks,
            html_blocks: builder.html_blocks,
            inline_html: builder.inline_html,
            headings: builder.headings,
            list_groups: builder.list_groups,
            refs,
            suppressions,
            frontmatter,
            math_regions,
            math_errors,
            line_index,
            #[cfg(test)]
            tree,
            list_tightness,
            link_like_ranges,
            block_checkpoints,
            format_facts,
        })
    }

    /// Borrow the line index built over the canonical source.
    pub(crate) fn line_index(&self) -> &LineIndex {
        &self.line_index
    }

    /// Test-only convenience that builds a [`Source`] from `src` and
    /// then parses through the chokepoint. Production code constructs
    /// a [`CanonicalSource`] once at [`crate::Document::parse`] and
    /// passes it down.
    ///
    /// Panics if the fixture fails to parse.
    ///
    /// [`Source`]: crate::source::Source
    /// [`CanonicalSource`]: crate::source::CanonicalSource
    #[cfg(test)]
    #[allow(clippy::expect_used, reason = "test helper rejects invalid fixtures")]
    pub(crate) fn parse_str(src: &str) -> Self {
        let source = crate::source::Source::new(src);
        Self::parse(&source, crate::ParseOptions::default()).expect("test Markdown parses")
    }
}
362
363fn build_block_checkpoints(source: &str, events: &[(Event<'_>, Range<usize>)]) -> Vec<BlockCheckpointFact> {
364    let source_len = u32::try_from(source.len()).unwrap_or(u32::MAX);
365    let cap = (source.len() / 64).saturating_add(2);
366    let mut points = Vec::with_capacity(cap);
367    points.push(BlockCheckpointFact {
368        byte: 0,
369        parser_state: 0,
370    });
371
372    let mut depth: u32 = 0;
373    let mut event_count: u32 = 0;
374    let try_push = |points: &mut Vec<BlockCheckpointFact>, range_start: usize, depth: u32, event_count: u32| {
375        let byte = u32::try_from(range_start).unwrap_or(u32::MAX);
376        if points.last().is_none_or(|last| last.byte < byte) {
377            points.push(BlockCheckpointFact {
378                byte,
379                parser_state: parser_state_hash(depth, event_count),
380            });
381        }
382    };
383    for (event, range) in events {
384        event_count = event_count.saturating_add(1);
385        walk_checkpoint_event(
386            event.clone(),
387            range.start,
388            &mut depth,
389            event_count,
390            &mut points,
391            &try_push,
392        );
393    }
394    if points.last().is_none_or(|last| last.byte < source_len) {
395        points.push(BlockCheckpointFact {
396            byte: source_len,
397            parser_state: parser_state_hash(depth, event_count),
398        });
399    }
400    points
401}
402
403fn walk_checkpoint_event(
404    event: Event<'_>,
405    range_start: usize,
406    depth: &mut u32,
407    event_count: u32,
408    points: &mut Vec<BlockCheckpointFact>,
409    try_push: &impl Fn(&mut Vec<BlockCheckpointFact>, usize, u32, u32),
410) {
411    match event {
412        Event::Start(tag) if *depth == 0 && is_top_level_block(&tag) => {
413            try_push(points, range_start, *depth, event_count);
414            if is_container(&tag) {
415                *depth = depth.saturating_add(1);
416            }
417        }
418        Event::Start(tag) if is_container(&tag) => {
419            *depth = depth.saturating_add(1);
420        }
421        Event::End(end) if is_container_end(end) => {
422            *depth = depth.saturating_sub(1);
423        }
424        Event::Rule if *depth == 0 => {
425            try_push(points, range_start, *depth, event_count);
426        }
427        Event::Start(_)
428        | Event::End(_)
429        | Event::Text(_)
430        | Event::Code(_)
431        | Event::InlineMath(_)
432        | Event::DisplayMath(_)
433        | Event::Html(_)
434        | Event::InlineHtml(_)
435        | Event::FootnoteReference(_)
436        | Event::SoftBreak
437        | Event::HardBreak
438        | Event::Rule
439        | Event::TaskListMarker(_) => {}
440    }
441}
442
/// Whether `tag` opens a block that earns a checkpoint when it starts
/// at container depth 0: paragraphs, headings, blockquotes, code
/// blocks, HTML blocks, lists, tables and footnote definitions.
fn is_top_level_block(tag: &Tag<'_>) -> bool {
    matches!(
        tag,
        Tag::Paragraph
            | Tag::Heading { .. }
            | Tag::BlockQuote(_)
            | Tag::CodeBlock(_)
            | Tag::HtmlBlock
            | Tag::List(_)
            | Tag::Table(_)
            | Tag::FootnoteDefinition(_)
    )
}
456
/// Whether `tag` opens a container that increases the checkpoint
/// walk's nesting depth. Must stay in step with [`is_container_end`]
/// so every increment has a matching decrement.
fn is_container(tag: &Tag<'_>) -> bool {
    matches!(
        tag,
        Tag::BlockQuote(_)
            | Tag::List(_)
            | Tag::Item
            | Tag::FootnoteDefinition(_)
            | Tag::Table(_)
            | Tag::TableHead
            | Tag::TableRow
            | Tag::TableCell
    )
}
470
/// Whether `end` closes one of the containers recognised by
/// [`is_container`], i.e. whether the checkpoint walk should decrease
/// its nesting depth.
fn is_container_end(end: TagEnd) -> bool {
    matches!(
        end,
        TagEnd::BlockQuote(_)
            | TagEnd::List(_)
            | TagEnd::Item
            | TagEnd::FootnoteDefinition
            | TagEnd::Table
            | TagEnd::TableHead
            | TagEnd::TableRow
            | TagEnd::TableCell
    )
}
484
/// Pack the walk state into one `u64`: `depth` occupies the high 32
/// bits and `event_count` the low 32 bits. Despite the name this is a
/// lossless packing, not a hash.
fn parser_state_hash(depth: u32, event_count: u32) -> u64 {
    let high = u64::from(depth) << 32;
    let low = u64::from(event_count);
    high | low
}
488
/// Walks the pulldown-cmark event stream and accumulates IR fields.
/// One pass per document; no borrow of the IR's final shape.
struct Builder<'a> {
    /// The full canonical source; every event range indexes into it.
    source: &'a str,
    /// Number of currently open code blocks. Text events are not
    /// recorded as prose while this is non-zero.
    in_code_block: u32,
    /// Stack of open headings: `(start_byte, level)`.
    heading_stack: Vec<(usize, u32)>,
    /// Stack of open lists; each entry holds the list's start offset,
    /// whether it is ordered, and items collected so far.
    list_stack: Vec<OpenList>,
    /// Stack of open code blocks: `(start_byte, info, fenced)`.
    code_block_stack: Vec<(usize, String, bool)>,
    /// Stack of open blockquotes: `start_byte`. Closed entries are
    /// drained into [`Self::blockquote_ranges`] for the
    /// transparent-runs computation.
    blockquote_stack: Vec<usize>,
    /// Closed blockquote ranges, in close order. Used by
    /// [`compute_transparent_runs`] to identify lines whose leading
    /// `>` marker the math recogniser must treat as non-content.
    blockquote_ranges: Vec<Range<usize>>,
    /// List-item ranges paired with their continuation-indent width
    /// (from [`item_continuation_width`]), recorded when each item
    /// opens. Used by [`compute_transparent_runs`] for
    /// continuation-line indentation.
    list_item_ranges: Vec<(Range<usize>, u8)>,
    /// Text runs outside code blocks.
    prose_chunks: Vec<TextSlice>,
    /// Inline code spans.
    inline_codes: Vec<InlineCode>,
    /// Code blocks, recorded as each one closes.
    code_blocks: Vec<CodeBlock>,
    /// HTML block entries, one per `Event::Html`.
    html_blocks: Vec<HtmlBlock>,
    /// Inline HTML tags, one per `Event::InlineHtml`.
    inline_html: Vec<InlineHtml>,
    /// Headings, recorded as each one closes.
    headings: Vec<Heading>,
    /// Lists, recorded as each one closes.
    list_groups: Vec<ListGroup>,
}
521
/// A list whose start event has been seen but whose end event has
/// not; lives on [`Builder::list_stack`] until the list closes.
struct OpenList {
    /// Start byte of the list's opening event.
    start: usize,
    /// Whether the parser reported a start number (ordered list).
    ordered: bool,
    /// Items collected so far, in source order.
    items: Vec<ListItem>,
}
527
528impl Builder<'_> {
529    #[allow(clippy::wildcard_enum_match_arm)] // many irrelevant Event variants
530    fn handle(&mut self, event: Event<'_>, range: Range<usize>) {
531        match event {
532            Event::Start(tag) => self.start(tag, range),
533            Event::End(tag) => self.end(tag, range),
534            Event::Text(_) => self.push_prose(range),
535            Event::Code(_) => self.push_inline_code(range),
536            Event::Html(_) => self.push_html_block(range),
537            Event::InlineHtml(_) => self.push_inline_html(range),
538            // SoftBreak, HardBreak, Rule, FootnoteReference,
539            // TaskListMarker, InlineMath, DisplayMath: none carry
540            // bytes we lint as their own chunks. Math events are
541            // disabled in Options; if they appear, ignore them.
542            _ => {}
543        }
544    }
545
546    #[allow(clippy::wildcard_enum_match_arm)] // many irrelevant Tag variants
547    fn start(&mut self, tag: Tag<'_>, range: Range<usize>) {
548        match tag {
549            Tag::Heading { level, .. } => {
550                self.heading_stack.push((range.start, level as u32));
551            }
552            Tag::CodeBlock(kind) => {
553                self.in_code_block = self.in_code_block.saturating_add(1);
554                let (info, fenced) = match kind {
555                    CodeBlockKind::Fenced(s) => (s.into_string(), true),
556                    CodeBlockKind::Indented => (String::new(), false),
557                };
558                self.code_block_stack.push((range.start, info, fenced));
559            }
560            Tag::List(start) => {
561                self.list_stack.push(OpenList {
562                    start: range.start,
563                    ordered: start.is_some(),
564                    items: Vec::new(),
565                });
566            }
567            Tag::Item => {
568                // Use the parent list's `ordered` flag to scan for the
569                // right marker class; see tree::derive_list_marker_byte
570                // for why `first_non_whitespace_byte(range.start)` is
571                // unsafe across container nesting.
572                let ordered = self.list_stack.last().is_some_and(|l| l.ordered);
573                let marker_byte = derive_item_marker_byte(self.source, range.clone(), ordered).unwrap_or(b'-');
574                let indent = item_continuation_width(self.source, &range);
575                self.list_item_ranges.push((range.clone(), indent));
576                if let Some(open) = self.list_stack.last_mut() {
577                    open.items.push(ListItem {
578                        raw_range: range,
579                        marker_byte,
580                    });
581                }
582            }
583            Tag::BlockQuote(_) => {
584                self.blockquote_stack.push(range.start);
585            }
586            #[allow(clippy::wildcard_enum_match_arm)]
587            _ => {}
588        }
589    }
590
591    #[allow(clippy::wildcard_enum_match_arm)] // many irrelevant TagEnd variants
592    fn end(&mut self, tag: TagEnd, range: Range<usize>) {
593        match tag {
594            TagEnd::Heading(_) => {
595                if let Some((start, level)) = self.heading_stack.pop() {
596                    let end = range.end;
597                    let raw = self.source.get(start..end).unwrap_or("");
598                    let (trimmed, off) = trim_heading(raw);
599                    self.headings.push(Heading {
600                        text: trimmed.to_owned(),
601                        byte_offset: start.saturating_add(off),
602                        raw_range: start..end,
603                        level,
604                    });
605                }
606            }
607            TagEnd::CodeBlock => {
608                self.in_code_block = self.in_code_block.saturating_sub(1);
609                if let Some((start, info, fenced)) = self.code_block_stack.pop() {
610                    let end = range.end;
611                    let raw = self.source.get(start..end).unwrap_or("");
612                    self.code_blocks.push(CodeBlock {
613                        text: raw.to_owned(),
614                        byte_offset: start,
615                        raw_range: start..end,
616                        info,
617                        fenced,
618                    });
619                }
620            }
621            TagEnd::List(_) => {
622                if let Some(open) = self.list_stack.pop() {
623                    self.list_groups.push(ListGroup {
624                        raw_range: open.start..range.end,
625                        ordered: open.ordered,
626                        items: open.items,
627                    });
628                }
629            }
630            TagEnd::BlockQuote(_) => {
631                if let Some(start) = self.blockquote_stack.pop() {
632                    self.blockquote_ranges.push(start..range.end);
633                }
634            }
635            #[allow(clippy::wildcard_enum_match_arm)]
636            _ => {}
637        }
638    }
639
640    fn push_prose(&mut self, range: Range<usize>) {
641        if self.in_code_block > 0 {
642            return;
643        }
644        // Recover a leading backslash that pulldown-cmark consumed as
645        // an escape. The escape is always exactly one byte (`\`) and
646        // sits immediately before the Text event's range.
647        let bytes = self.source.as_bytes();
648        let start = if range.start > 0 && bytes.get(range.start.saturating_sub(1)) == Some(&b'\\') {
649            range.start.saturating_sub(1)
650        } else {
651            range.start
652        };
653        let end = range.end;
654        let Some(text) = self.source.get(start..end) else {
655            return;
656        };
657        self.prose_chunks.push(TextSlice {
658            text: text.to_owned(),
659            byte_offset: start,
660            raw_range: start..end,
661        });
662    }
663
664    fn push_inline_code(&mut self, range: Range<usize>) {
665        let raw = self.source.get(range.clone()).unwrap_or("");
666        let lead = raw.bytes().take_while(|&b| b == b'`').count();
667        let trail = raw.bytes().rev().take_while(|&b| b == b'`').count();
668        let (content_start, content_end) = if lead == 0 || trail == 0 || lead.saturating_add(trail) >= raw.len() {
669            (range.start, range.end)
670        } else {
671            (range.start.saturating_add(lead), range.end.saturating_sub(trail))
672        };
673        let Some(text) = self.source.get(content_start..content_end) else {
674            return;
675        };
676        self.inline_codes.push(InlineCode {
677            text: text.to_owned(),
678            byte_offset: content_start,
679            raw_range: range,
680        });
681    }
682
683    fn push_html_block(&mut self, range: Range<usize>) {
684        let Some(text) = self.source.get(range.clone()) else {
685            return;
686        };
687        self.html_blocks.push(HtmlBlock {
688            text: text.to_owned(),
689            byte_offset: range.start,
690            raw_range: range,
691        });
692    }
693
694    fn push_inline_html(&mut self, range: Range<usize>) {
695        let Some(text) = self.source.get(range.clone()) else {
696            return;
697        };
698        self.inline_html.push(InlineHtml {
699            text: text.to_owned(),
700            byte_offset: range.start,
701            raw_range: range,
702        });
703    }
704}
705
/// Find a list item's marker byte by scanning `range` for the first
/// byte in the legal marker class: an ASCII digit when `ordered`,
/// otherwise one of `-`, `*`, `+`.
///
/// Mirrors `tree::derive_list_marker_byte`; pulldown's item range can
/// include parent-container marker bytes when the separator after the
/// parent's marker is a tab (see `fuzz_blockquote_tab_list_marker.in`),
/// so the naive "first non-whitespace byte at range.start" scan
/// returns the parent's marker, not the item's.
///
/// Returns `None` when `range` is out of bounds for `source` or holds
/// no byte of the requested class.
fn derive_item_marker_byte(source: &str, range: core::ops::Range<usize>, ordered: bool) -> Option<u8> {
    let window = source.as_bytes().get(range)?;
    let hit = if ordered {
        window.iter().position(u8::is_ascii_digit)
    } else {
        window.iter().position(|b| matches!(*b, b'-' | b'*' | b'+'))
    }?;
    window.get(hit).copied()
}
725
/// Width, in bytes, of a list item's marker prefix: from the start of
/// the item's first non-blank line up to and including the single
/// space after the marker. Drives the list-item branch of
/// [`compute_transparent_runs`]: continuation lines of the item have
/// this many leading bytes available to peel.
///
/// Any spaces or tabs in front of the marker on that line are counted
/// too, so a nested item whose marker sits at column 2 reports a width
/// that includes those two bytes. The result can therefore be used
/// directly as a "strip this many bytes" instruction on continuation
/// lines, even when the item is nested under another list or
/// blockquote.
///
/// Returns 0 when the range is out of bounds, every line is blank, or
/// the first non-blank line does not begin with a bullet (`-`, `*`,
/// `+`) or with digits followed by `.` or `)`. Widths above 255 are
/// clamped to `u8::MAX`.
fn item_continuation_width(source: &str, raw_range: &Range<usize>) -> u8 {
    let item = source.as_bytes().get(raw_range.clone()).unwrap_or(&[]);

    // A line counts as blank when it holds only spaces, tabs or `\r`.
    let first_content_line = item
        .split(|&b| b == b'\n')
        .find(|line| line.iter().any(|b| !matches!(*b, b' ' | b'\t' | b'\r')));
    let Some(line) = first_content_line else {
        return 0;
    };

    let indent = line.iter().take_while(|b| matches!(**b, b' ' | b'\t')).count();
    let after_indent = line.get(indent..).unwrap_or(&[]);

    let digits = after_indent.iter().take_while(|b| b.is_ascii_digit()).count();
    let marker_len = if digits > 0 {
        // Ordered marker: the digit run must be closed by `.` or `)`.
        if !matches!(after_indent.get(digits), Some(b'.' | b')')) {
            return 0;
        }
        digits.saturating_add(1)
    } else if matches!(after_indent.first(), Some(b'-' | b'*' | b'+')) {
        1
    } else {
        return 0;
    };

    let marker_end = indent.saturating_add(marker_len);
    let width = if line.get(marker_end) == Some(&b' ') {
        marker_end.saturating_add(1)
    } else {
        marker_end
    };
    u8::try_from(width).unwrap_or(u8::MAX)
}
775
/// Collect the per-line prefixes the math recogniser must skip over:
/// blockquote `>` markers (each with its one optional trailing space)
/// and the leading indentation of list-item continuation lines.
///
/// `blockquote_ranges` are the byte ranges of blockquotes. Each entry
/// of `list_item_ranges` pairs a list item's byte range with a width:
/// the maximum number of leading spaces peeled in one pass.
///
/// The result holds at most one range per line, each starting at the
/// line's first byte, in ascending order. When both inputs are empty
/// the function returns `Vec::new()` before scanning anything.
fn compute_transparent_runs(
    source: &str,
    blockquote_ranges: &[Range<usize>],
    list_item_ranges: &[(Range<usize>, u8)],
) -> Vec<Range<usize>> {
    if blockquote_ranges.is_empty() && list_item_ranges.is_empty() {
        return Vec::new();
    }
    let bytes = source.as_bytes();
    let is_space_at = |pos: usize| bytes.get(pos).copied() == Some(b' ');

    // Blockquote peel: up to three leading spaces, a `>` that lies on
    // this line, then one optional space. Only applies when some
    // blockquote range covers the position the peel starts from.
    // Yields the position just past the peeled marker.
    let peel_quote = |at: usize, line_end: usize| -> Option<usize> {
        let indent = (0..3usize)
            .take_while(|&k| is_space_at(at.saturating_add(k)))
            .count();
        let marker = at.saturating_add(indent);
        let marker_on_line = marker < line_end && bytes.get(marker).copied() == Some(b'>');
        if !marker_on_line || !blockquote_ranges.iter().any(|r| r.contains(&at)) {
            return None;
        }
        let after = marker.saturating_add(1);
        if after < line_end && is_space_at(after) {
            Some(after.saturating_add(1))
        } else {
            Some(after)
        }
    };

    // List-item continuation peel: choose the item with the greatest
    // start among those that begin strictly before this line and still
    // extend past the current position, then eat up to its width in
    // spaces. Yields `None` when no space could be eaten.
    let peel_indent = |at: usize, line_start: usize, line_end: usize| -> Option<usize> {
        let width = list_item_ranges
            .iter()
            .filter(|(r, _)| r.start < line_start && at < r.end)
            .max_by_key(|(r, _)| r.start)
            .map(|(_, w)| usize::from(*w))?;
        let eaten = (0..width)
            .take_while(|&k| {
                let pos = at.saturating_add(k);
                pos < line_end && is_space_at(pos)
            })
            .count();
        (eaten > 0).then(|| at.saturating_add(eaten))
    };

    let mut runs: Vec<Range<usize>> = Vec::new();
    let mut line_start = 0usize;
    for line in source.split('\n') {
        let line_end = line_start.saturating_add(line.len());
        let mut cursor = line_start;
        // Keep peeling, blockquote markers first, until neither kind
        // of prefix matches at the cursor. Every successful peel moves
        // the cursor forward, so this terminates.
        while let Some(next) =
            peel_quote(cursor, line_end).or_else(|| peel_indent(cursor, line_start, line_end))
        {
            cursor = next;
        }
        if cursor > line_start {
            runs.push(line_start..cursor);
        }
        line_start = line_end.saturating_add(1);
    }
    runs
}
854
/// Strip ATX `#` markers and surrounding whitespace from a heading's
/// raw source range. Returns the trimmed text plus the byte offset of
/// the first text byte relative to the range start.
///
/// Handles ATX (`## Foo`) and setext (`Foo\n---`) shapes; only the
/// first line of `raw` is examined, which for setext is the text line.
///
/// A trailing run of `#` is removed only when it is a closing sequence
/// in the CommonMark sense: preceded by a space or tab, or making up
/// the whole content after the opener (`## ##`). A run glued to the
/// text is part of the heading, so `# C#` yields `C#` and `# foo \#`
/// yields `foo \#`.
///
/// Leading `#` bytes are counted from the very first byte of `raw`;
/// this assumes the range starts at the opener for ATX headings
/// (TODO confirm against the caller for indented headings).
fn trim_heading(raw: &str) -> (&str, usize) {
    const BLANK: [char; 2] = [' ', '\t'];

    // Drop one trailing newline, then keep only the first line.
    let body = raw.strip_suffix('\n').unwrap_or(raw);
    let first_line = body.split_once('\n').map_or(body, |(first, _)| first);

    // Opening `#` run and the whitespace after it. `content` is a
    // suffix of `first_line`, so the length difference is the offset
    // of the first text byte.
    let content = first_line.trim_start_matches('#').trim_start_matches(BLANK);
    let inner_start = first_line.len().saturating_sub(content.len());

    let content = content.trim_end_matches(BLANK);
    let before_run = content.trim_end_matches('#');
    // A trailing `#` run only closes the heading when whitespace
    // separates it from the text, or when nothing else is left.
    let run_closes = before_run.len() < content.len()
        && (before_run.is_empty() || before_run.ends_with(BLANK));
    let text = if run_closes {
        before_run.trim_end_matches(BLANK)
    } else {
        content
    };
    (text, inner_start)
}
883
884/// Detect and split off frontmatter at the document start. Returns
885/// the byte offset where the body begins and an optional
886/// [`Frontmatter`] covering the region.
887///
888/// Accepts two delimiters:
889///
890/// - `---\n…\n---\n` (or `…\n...\n`): YAML.
891/// - `+++\n…\n+++\n`: TOML.
892fn split_frontmatter(source: &str) -> (usize, Option<Frontmatter>) {
893    let first_line_end = source.find('\n');
894    let first_line = first_line_end.map_or(source, |n| source.get(..n).unwrap_or(""));
895    let trimmed_first = first_line.trim_end();
896    let delimiter = match trimmed_first {
897        "---" => FrontmatterDelimiter::Yaml,
898        "+++" => FrontmatterDelimiter::Toml,
899        _ => return (0, None),
900    };
901    let body_start = first_line_end.map_or(source.len(), |n| n.saturating_add(1));
902    let Some(rest) = source.get(body_start..) else {
903        return (0, None);
904    };
905    let mut cursor = 0usize;
906    while cursor < rest.len() {
907        let nl = rest
908            .get(cursor..)
909            .and_then(|s| s.find('\n'))
910            .unwrap_or_else(|| rest.len().saturating_sub(cursor));
911        let end_excl = cursor.saturating_add(nl);
912        let line = rest.get(cursor..end_excl).unwrap_or("");
913        let trimmed = line.trim_end();
914        let is_close = match delimiter {
915            FrontmatterDelimiter::Yaml => trimmed == "---" || trimmed == "...",
916            FrontmatterDelimiter::Toml => trimmed == "+++",
917        };
918        if is_close {
919            // Disambiguate a real frontmatter block from a leading
920            // thematic break (`---`) plus a later thematic break that
921            // happens to match the closing delimiter. A YAML / TOML
922            // frontmatter body always contains at least one key-shaped
923            // line (`key:` or `key =`); if none is present we treat
924            // the source as ordinary Markdown. This is the narrowest
925            // rule that preserves every real fixture while rejecting
926            // the round-trip `---\n\n[a][a]\n\n---\n…` shape.
927            let body_text = rest.get(..end_excl).unwrap_or("");
928            if !frontmatter_body_has_key(body_text, delimiter) {
929                return (0, None);
930            }
931            let total = body_start.saturating_add(end_excl).saturating_add(1).min(source.len());
932            let text = source.get(0..total).unwrap_or("");
933            return (
934                total,
935                Some(Frontmatter {
936                    slice: TextSlice {
937                        text: text.to_owned(),
938                        byte_offset: 0,
939                        raw_range: 0..total,
940                    },
941                    delimiter,
942                }),
943            );
944        }
945        cursor = end_excl.saturating_add(1);
946    }
947    // No closing delimiter: the opener is a thematic break (`---`)
948    // or just plain text (`+++`), not a frontmatter fence. Returning
949    // the whole source as frontmatter would byte-preserve the document
950    // by short-circuiting the tree builder, which masks the structural
951    // emit's loose-list normalisation for any document that happens to
952    // start with `---\n`. Treat as no frontmatter and let pulldown
953    // reparse the opener.
954    let _ = delimiter;
955    (0, None)
956}
957
958/// True if `body` contains at least one line shaped like a YAML key
959/// (`name:`) or a TOML key (`name =`). Used by `split_frontmatter` to
960/// reject false positives where the opening `---` is really a thematic
961/// break and a later thematic break supplies the apparent close.
962fn frontmatter_body_has_key(body: &str, delimiter: FrontmatterDelimiter) -> bool {
963    let key_byte = match delimiter {
964        FrontmatterDelimiter::Yaml => b':',
965        FrontmatterDelimiter::Toml => b'=',
966    };
967    body.lines().any(|line| line_has_key(line, key_byte))
968}
969
/// Reports whether `line` has the shape `<indent><name><blanks><key_byte>`.
///
/// - `<indent>` and `<blanks>` are any number of spaces or tabs
///   (possibly none).
/// - `<name>` starts with an ASCII letter or `_` and continues with
///   ASCII letters, digits, `_`, `-` or `.`.
///
/// Whatever follows `key_byte` is not inspected.
fn line_has_key(line: &str, key_byte: u8) -> bool {
    fn is_blank(b: u8) -> bool {
        matches!(b, b' ' | b'\t')
    }
    fn continues_name(b: u8) -> bool {
        b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-' | b'.')
    }

    // Skip the indentation; the first byte yielded is the candidate
    // start of the name.
    let mut rest = line.bytes().skip_while(|&b| is_blank(b));
    match rest.next() {
        Some(first) if first.is_ascii_alphabetic() || first == b'_' => {}
        _ => return false,
    }
    // Consume the remainder of the name, then look at the first byte
    // that is not a blank: it must be the separator.
    rest.skip_while(|&b| continues_name(b)).find(|&b| !is_blank(b)) == Some(key_byte)
}
1000
1001fn suppression_regex() -> &'static Regex {
1002    static RE: OnceLock<Regex> = OnceLock::new();
1003    // Order matters: `allow-next-line` must precede `allow`, and
1004    // `disable-all` / `enable-all` must precede their bare forms,
1005    // because regex alternation is greedy left-to-right.
1006    // Leading whitespace is space-only: tabs do not count as
1007    // indentation (CommonMark §2.2; the mdformat-mkdocs tab bug is
1008    // the negative reference).
1009    RE.get_or_init(|| {
1010        compile_static(
1011            r"^ {0,3}<!--\s*mdwright:\s*(?P<kind>allow-next-line|allow|disable-all|enable-all|disable|enable)(?:[ \t]+(?P<names>[\w\-,\s]+?))?\s*-->\s*$",
1012        )
1013    })
1014}
1015
1016/// Parse suppression directives from HTML comments. Only block-level
1017/// HTML is consulted; pulldown-cmark already distinguishes a comment
1018/// on its own line (`HtmlBlock`) from an inline comment (`InlineHtml`),
1019/// which gives us the "own source line" requirement for free.
1020fn scan_suppressions(html_blocks: &[HtmlBlock]) -> Vec<Suppression> {
1021    let mut out = Vec::new();
1022    let re = suppression_regex();
1023    for block in html_blocks {
1024        let trimmed = block.text.trim_end();
1025        let Some(caps) = re.captures(trimmed) else {
1026            continue;
1027        };
1028        let Some(kind_match) = caps.name("kind") else {
1029            continue;
1030        };
1031        let kind = match kind_match.as_str() {
1032            "allow" => SuppressionKind::Allow {
1033                scope: AllowScope::Block,
1034            },
1035            "allow-next-line" => SuppressionKind::Allow {
1036                scope: AllowScope::NextLine,
1037            },
1038            "disable" | "disable-all" => SuppressionKind::Disable,
1039            "enable" | "enable-all" => SuppressionKind::Enable,
1040            _ => continue,
1041        };
1042        let rules: Vec<String> = caps
1043            .name("names")
1044            .map_or("", |m| m.as_str())
1045            .split([',', ' ', '\t'])
1046            .filter(|s| !s.is_empty())
1047            .map(str::to_owned)
1048            .collect();
1049        // `allow` and `allow-next-line` require explicit names; a bare
1050        // form is malformed syntax and is silently dropped. `disable`
1051        // / `enable` accept an empty name list (= "every known rule").
1052        if matches!(kind, SuppressionKind::Allow { .. }) && rules.is_empty() {
1053            continue;
1054        }
1055        out.push(Suppression {
1056            kind,
1057            rules,
1058            raw_range: block.raw_range.clone(),
1059        });
1060    }
1061    out
1062}
1063
#[cfg(test)]
#[allow(
    clippy::indexing_slicing,
    reason = "test asserts; panic surface is the test framework"
)]
mod tests {
    use super::{
        AllowScope, FrontmatterDelimiter, Ir, SuppressionKind, compute_transparent_runs, line_has_key,
        split_frontmatter, trim_heading,
    };

    /// Turn a missing value into a test failure carrying `label`, so
    /// tests can use `?` instead of unwrapping.
    fn some_ref<'a, T>(value: Option<&'a T>, label: &str) -> Result<&'a T, String> {
        value.ok_or_else(|| label.to_owned())
    }

    // ---- prose, code, and inline collection ----

    #[test]
    fn prose_chunks_include_backslash_escapes() {
        let ir = Ir::parse_str(r"a \_b\_ c");
        let texts: Vec<&str> = ir.prose_chunks.iter().map(|c| c.text.as_str()).collect();
        assert!(
            texts.iter().any(|t| t.contains(r"\_")),
            "prose chunks should preserve `\\_`: {texts:?}"
        );
    }

    #[test]
    fn fenced_code_excluded_from_prose() {
        let src = "before\n```\nx \\_y\\_ z\n```\nafter \\_outside\\_\n";
        let ir = Ir::parse_str(src);
        // No chunk should contain the code-block body.
        for c in &ir.prose_chunks {
            assert!(!c.text.contains("\\_y"), "prose chunk leaked code body: {:?}", c.text);
        }
        // The escapes outside the fence ARE visible: at least one
        // chunk must contain `\_` and at least one must contain
        // `outside`. (Text events split at escape boundaries, so the
        // full literal `\_outside\_` is spread across multiple chunks.)
        let texts: Vec<&str> = ir.prose_chunks.iter().map(|c| c.text.as_str()).collect();
        assert!(texts.iter().any(|t| t.contains("\\_")), "no chunk has `\\_`: {texts:?}");
        assert!(
            texts.iter().any(|t| t.contains("outside")),
            "no chunk has `outside`: {texts:?}"
        );
        assert_eq!(ir.code_blocks.len(), 1);
    }

    #[test]
    fn inline_code_strips_fences() -> Result<(), String> {
        let ir = Ir::parse_str("see `foo_bar` here\n");
        assert_eq!(ir.inline_codes.len(), 1);
        let code = some_ref(ir.inline_codes.first(), "missing")?;
        assert_eq!(code.text, "foo_bar");
        Ok(())
    }

    #[test]
    fn inline_html_collected() {
        let src = "before <span>x</span> after\n";
        let ir = Ir::parse_str(src);
        assert!(ir.inline_html.iter().any(|h| h.text == "<span>"));
        assert!(ir.inline_html.iter().any(|h| h.text == "</span>"));
    }

    #[test]
    fn code_block_info_string() -> Result<(), String> {
        let src = "```rust\nfn x() {}\n```\n";
        let ir = Ir::parse_str(src);
        assert_eq!(ir.code_blocks.len(), 1);
        let cb = some_ref(ir.code_blocks.first(), "cb")?;
        assert_eq!(cb.info, "rust");
        assert!(cb.fenced);
        Ok(())
    }

    // ---- frontmatter ----

    #[test]
    fn frontmatter_split() -> Result<(), String> {
        let src = "---\ntitle: T\n---\nbody text\n";
        let ir = Ir::parse_str(src);
        let fm = some_ref(ir.frontmatter.as_ref(), "frontmatter")?;
        assert_eq!(fm.delimiter, FrontmatterDelimiter::Yaml);
        let body_chunks: Vec<&str> = ir.prose_chunks.iter().map(|c| c.text.as_str()).collect();
        assert!(body_chunks.iter().any(|t| t == &"body text"));
        Ok(())
    }

    #[test]
    fn frontmatter_opener_without_close_is_thematic_break() {
        // `---\n` is a YAML opener, but with no closing `---` the
        // document is not frontmatter; it is a thematic break
        // followed by Markdown. Confirming this via `prose_chunks`:
        // body text after the opener must surface as prose, not be
        // swallowed into a stub frontmatter.
        let src = "---\n\n- a\n- a\n\n- a\n";
        let ir = Ir::parse_str(src);
        assert!(ir.frontmatter.is_none(), "no frontmatter without close");
        let any_a = ir.prose_chunks.iter().any(|c| c.text == "a");
        assert!(
            any_a,
            "body markdown should be parsed as prose, got {:?}",
            ir.prose_chunks
        );
    }

    #[test]
    fn frontmatter_toml_split() -> Result<(), String> {
        let src = "+++\ntitle = \"T\"\n+++\nbody text\n";
        let ir = Ir::parse_str(src);
        let fm = some_ref(ir.frontmatter.as_ref(), "frontmatter")?;
        assert_eq!(fm.delimiter, FrontmatterDelimiter::Toml);
        let body_chunks: Vec<&str> = ir.prose_chunks.iter().map(|c| c.text.as_str()).collect();
        assert!(body_chunks.iter().any(|t| t == &"body text"));
        Ok(())
    }

    #[test]
    fn frontmatter_fence_pair_without_key_is_rejected() {
        // Two thematic breaks around key-less content must not be
        // mistaken for a YAML fence pair.
        let (body_start, fm) = split_frontmatter("---\n\n[a][a]\n\n---\nbody\n");
        assert_eq!(body_start, 0);
        assert!(fm.is_none(), "key-less region must not be frontmatter");
    }

    #[test]
    fn frontmatter_body_offset_follows_the_closer() {
        // `---\n` (4) + `title: T\n` (9) + `---\n` (4) = 17.
        let (body_start, fm) = split_frontmatter("---\ntitle: T\n---\nbody\n");
        assert_eq!(body_start, 17);
        assert!(fm.is_some(), "keyed region should be frontmatter");
    }

    #[test]
    fn line_has_key_recognises_key_shapes() {
        assert!(line_has_key("title: T", b':'));
        assert!(line_has_key("  name = 1", b'='));
        assert!(!line_has_key("- item", b':'));
        assert!(!line_has_key("key", b':'));
        assert!(!line_has_key("", b':'));
    }

    // ---- headings, lists, link definitions ----

    #[test]
    fn headings_trimmed_and_levelled() {
        let ir = Ir::parse_str("# One\n\n## Two ##\n\n### Three\n");
        assert_eq!(ir.headings.len(), 3);
        let texts: Vec<(&str, u32)> = ir.headings.iter().map(|h| (h.text.as_str(), h.level)).collect();
        assert_eq!(texts, vec![("One", 1), ("Two", 2), ("Three", 3)]);
    }

    #[test]
    fn trim_heading_reports_text_and_offset() {
        assert_eq!(trim_heading("# One"), ("One", 2));
        assert_eq!(trim_heading("## Two ##\n"), ("Two", 3));
        // Setext: only the text line is considered.
        assert_eq!(trim_heading("Foo\n---\n"), ("Foo", 0));
    }

    #[test]
    fn list_groups_record_markers() -> Result<(), String> {
        let src = "- one\n- two\n* three\n";
        let ir = Ir::parse_str(src);
        assert_eq!(ir.list_groups.len(), 2);
        let g1 = some_ref(ir.list_groups.first(), "first list")?;
        assert!(!g1.ordered);
        let markers: Vec<u8> = g1.items.iter().map(|i| i.marker_byte).collect();
        assert_eq!(markers, vec![b'-', b'-']);
        let g2 = some_ref(ir.list_groups.get(1), "second list")?;
        let item = some_ref(g2.items.first(), "item")?;
        assert_eq!(item.marker_byte, b'*');
        Ok(())
    }

    #[test]
    fn link_defs_scanned() -> Result<(), String> {
        let src = "[bar]: https://example.com\n\nSee [ref][bar].\n";
        let ir = Ir::parse_str(src);
        let target = some_ref(ir.refs.iter().next(), "expected one target")?;
        assert_eq!(target.label_raw, "bar");
        assert_eq!(target.dest, "https://example.com");
        Ok(())
    }

    #[test]
    fn link_defs_skipped_inside_code_block() {
        let src = "```\n[bar]: https://example.com\n```\n";
        let ir = Ir::parse_str(src);
        assert!(ir.refs.is_empty());
    }

    // ---- suppression directives ----

    #[test]
    fn suppression_allow_parses() -> Result<(), String> {
        let src = "<!-- mdwright: allow heading-punctuation -->\n# Title.\n";
        let ir = Ir::parse_str(src);
        assert_eq!(ir.suppressions.len(), 1);
        let s = some_ref(ir.suppressions.first(), "first")?;
        assert_eq!(
            s.kind,
            SuppressionKind::Allow {
                scope: AllowScope::Block
            }
        );
        assert_eq!(s.rules, vec!["heading-punctuation"]);
        Ok(())
    }

    #[test]
    fn suppression_allow_next_line_parses() -> Result<(), String> {
        let src = "<!-- mdwright: allow-next-line trailing-whitespace -->\nfoo \n";
        let ir = Ir::parse_str(src);
        let s = some_ref(ir.suppressions.first(), "first")?;
        assert_eq!(
            s.kind,
            SuppressionKind::Allow {
                scope: AllowScope::NextLine
            }
        );
        Ok(())
    }

    #[test]
    fn suppression_multiple_rules_parses() -> Result<(), String> {
        let src = "<!-- mdwright: allow rule-a, rule-b, rule-c -->\nbody\n";
        let ir = Ir::parse_str(src);
        let s = some_ref(ir.suppressions.first(), "first")?;
        assert_eq!(s.rules, vec!["rule-a", "rule-b", "rule-c"]);
        Ok(())
    }

    #[test]
    fn suppression_disable_enable_parse() -> Result<(), String> {
        let src = "<!-- mdwright: disable bare-url -->\n\nfoo\n\n<!-- mdwright: enable bare-url -->\n";
        let ir = Ir::parse_str(src);
        assert_eq!(ir.suppressions.len(), 2);
        let first = some_ref(ir.suppressions.first(), "first")?;
        let second = some_ref(ir.suppressions.get(1), "second")?;
        assert_eq!(first.kind, SuppressionKind::Disable);
        assert_eq!(second.kind, SuppressionKind::Enable);
        Ok(())
    }

    #[test]
    fn suppression_disable_all_alias_parses() -> Result<(), String> {
        let src = "<!-- mdwright: disable-all -->\nfoo\n";
        let ir = Ir::parse_str(src);
        let s = some_ref(ir.suppressions.first(), "first")?;
        assert_eq!(s.kind, SuppressionKind::Disable);
        assert!(s.rules.is_empty());
        Ok(())
    }

    #[test]
    fn suppression_bare_allow_rejected() {
        // `allow` with no names is malformed; silently dropped.
        let src = "<!-- mdwright: allow -->\n# Title\n";
        let ir = Ir::parse_str(src);
        assert!(ir.suppressions.is_empty());
    }

    #[test]
    fn suppression_inline_html_ignored() {
        // A comment inside a paragraph is InlineHtml, not HtmlBlock,
        // so the scanner doesn't see it. This preserves the "own
        // source line" requirement.
        let src = "Some text <!-- mdwright: allow bare-url --> more text.\n";
        let ir = Ir::parse_str(src);
        assert!(ir.suppressions.is_empty());
    }

    #[test]
    fn suppression_with_indent_parses() -> Result<(), String> {
        // Up to three spaces of indentation are allowed.
        let src = "   <!-- mdwright: allow heading-punctuation -->\n# Title.\n";
        let ir = Ir::parse_str(src);
        let s = some_ref(ir.suppressions.first(), "first")?;
        assert_eq!(s.rules, vec!["heading-punctuation"]);
        Ok(())
    }

    // ---- transparent runs ----

    #[test]
    fn transparent_runs_for_blockquote_continuation() {
        // Two `>` lines yield one transparent run per line covering
        // the `> ` prefix.
        let src = "> a\n> b\n";
        let bq = 0..src.len();
        let runs = compute_transparent_runs(src, std::slice::from_ref(&bq), &[]);
        assert_eq!(runs, vec![0..2, 4..6]);
    }

    #[test]
    fn transparent_runs_for_nested_blockquote() {
        // `> > a / > > b`: each line gets one run combining both
        // levels of nesting (`> > ` is 4 bytes).
        let src = "> > a\n> > b\n";
        let outer = 0..src.len();
        let inner = 2..src.len();
        let runs = compute_transparent_runs(src, &[outer, inner], &[]);
        assert_eq!(runs, vec![0..4, 6..10]);
    }

    #[test]
    fn transparent_runs_for_list_item_continuation() {
        // `1. a\n   b\n`: line 1 is the marker line (no run); line 2
        // is a continuation line whose 3-space indent is stripped.
        let src = "1. a\n   b\n";
        let item = (0..src.len(), 3);
        let runs = compute_transparent_runs(src, &[], &[item]);
        assert_eq!(runs, vec![5..8]);
    }

    #[test]
    fn transparent_runs_empty_for_plain_paragraph() {
        // No container context → no transparent runs (fast path).
        let src = "hello\nworld\n";
        let runs = compute_transparent_runs(src, &[], &[]);
        assert!(runs.is_empty(), "expected empty: {runs:?}");
    }
}
mdwright_document/ir.rs

mdwright_document/
ir.rs