use std::ops::Range;
use std::sync::OnceLock;
use pulldown_cmark::{CodeBlockKind, Event, Tag, TagEnd};
use regex::Regex;
use crate::format_facts::FormatFacts;
use crate::gfm::{AutolinkFact, collect_autolinks};
use crate::line_index::LineIndex;
use crate::parse;
use crate::refs::{ReferenceTable, build_reference_table};
use crate::source::{CanonicalSource, Source};
use crate::tree::TreeBuilder;
use crate::util::regex::compile_static;
use crate::{ParseError, ParseOptions};
use mdwright_math::{MathError, MathRegion, scan_math_regions};
/// An owned copy of a span of the source together with its location.
///
/// Used in this file for prose chunks and for the frontmatter block.
#[derive(Clone, Debug)]
pub struct TextSlice {
/// Owned copy of the source text covered by `raw_range`.
pub text: String,
/// Byte offset in the source at which `text` starts.
pub byte_offset: usize,
/// Byte range in the source that `text` was copied from.
pub raw_range: Range<usize>,
}
/// An inline code span recorded from an `Event::Code` parser event.
#[derive(Clone, Debug)]
pub struct InlineCode {
/// Source text between the backtick fences. When the span has no leading or
/// trailing backticks, or nothing between them, this is the whole span.
pub text: String,
/// Byte offset in the source at which `text` starts.
pub byte_offset: usize,
/// Byte range of the whole event, backtick fences included.
pub raw_range: Range<usize>,
}
/// A fenced or indented code block.
#[derive(Clone, Debug)]
pub struct CodeBlock {
/// Raw source from the start of the block's start event to the end of its
/// end event.
pub text: String,
/// Byte offset in the source at which the block starts.
pub byte_offset: usize,
/// Byte range in the source covered by the block.
pub raw_range: Range<usize>,
/// Info string reported by the parser for a fenced block; empty for an
/// indented block.
pub info: String,
/// `true` for a fenced block, `false` for an indented one.
pub fenced: bool,
}
/// Source text of one `Event::Html` parser event.
#[derive(Clone, Debug)]
pub struct HtmlBlock {
/// Owned copy of the source covered by the event.
pub text: String,
/// Byte offset in the source at which the event starts.
pub byte_offset: usize,
/// Byte range of the event in the source.
pub raw_range: Range<usize>,
}
/// Source text of one `Event::InlineHtml` parser event.
#[derive(Clone, Debug)]
pub struct InlineHtml {
/// Owned copy of the source covered by the event.
pub text: String,
/// Byte offset in the source at which the event starts.
pub byte_offset: usize,
/// Byte range of the event in the source.
pub raw_range: Range<usize>,
}
/// A heading with its marker characters trimmed away.
#[derive(Clone, Debug)]
pub struct Heading {
/// Heading text as returned by `trim_heading` for the raw heading span.
pub text: String,
/// Byte offset in the source at which `text` starts (heading start plus the
/// offset reported by `trim_heading`).
pub byte_offset: usize,
/// Byte range from the heading's start event to the end of its end event.
pub raw_range: Range<usize>,
/// Heading level as a number (the parser's level cast to `u32`).
pub level: u32,
}
/// One list (ordered or unordered) and the items opened directly inside it.
#[derive(Clone, Debug)]
pub struct ListGroup {
/// Byte range from the list's start event to the end of its end event.
pub raw_range: Range<usize>,
/// `true` when the parser reported a start number for the list.
pub ordered: bool,
/// Items in the order their start events were seen.
pub items: Vec<ListItem>,
}
/// A single list item.
#[derive(Clone, Debug)]
pub struct ListItem {
/// Byte range of the item's start event.
pub raw_range: Range<usize>,
/// Marker byte derived from the item's source: the first ASCII digit for an
/// ordered list, otherwise the first of `-`, `*`, `+`. Falls back to `b'-'`
/// when none is found.
pub marker_byte: u8,
}
/// A frontmatter block found at the very start of the document.
#[derive(Clone, Debug)]
pub struct Frontmatter {
/// Source from byte 0 through the closing delimiter line (including its
/// newline when present).
pub slice: TextSlice,
/// Which delimiter style opened the block.
pub delimiter: FrontmatterDelimiter,
}
/// Delimiter style of a frontmatter block.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum FrontmatterDelimiter {
/// Opened by a `---` line; closed by a `---` or `...` line.
Yaml,
/// Opened and closed by a `+++` line.
Toml,
}
/// A link definition whose string fields borrow from another buffer.
///
/// NOTE(review): nothing in the visible part of this file constructs this
/// type; the field descriptions below are inferred from the names only.
#[derive(Clone, Debug)]
pub struct LinkDef<'a> {
/// Presumably the definition's label — TODO confirm against the producer.
pub label: &'a str,
/// Presumably the link destination — TODO confirm.
pub dest: &'a str,
/// Presumably the optional link title — TODO confirm.
pub title: Option<&'a str>,
/// Byte range associated with the definition.
pub raw_range: Range<usize>,
}
/// A `<!-- mdwright: ... -->` directive recognised in an HTML block.
#[derive(Clone, Debug)]
pub struct Suppression {
/// Which directive keyword was used.
pub kind: SuppressionKind,
/// Rule names listed after the keyword, split on commas, spaces and tabs.
/// May be empty for `Disable`/`Enable`; never empty for `Allow`.
pub rules: Vec<String>,
/// Byte range of the HTML block that carried the directive.
pub raw_range: Range<usize>,
}
/// A byte position recorded by `build_block_checkpoints`, with a compact
/// summary of the event walk at that point.
#[derive(Copy, Clone, Debug)]
pub struct BlockCheckpointFact {
/// Byte offset in the source, saturated to `u32::MAX` if it does not fit.
pub byte: u32,
/// Container depth in the high 32 bits and the running event count in the
/// low 32 bits; the initial checkpoint at byte 0 uses `0`.
pub parser_state: u64,
}
/// The directive keyword of a [`Suppression`].
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SuppressionKind {
/// Produced by the `allow` and `allow-next-line` keywords.
Allow { scope: AllowScope },
/// Produced by the `disable` and `disable-all` keywords.
Disable,
/// Produced by the `enable` and `enable-all` keywords.
Enable,
}
/// Scope attached to an `Allow` suppression.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum AllowScope {
/// Produced by the `allow` keyword.
Block,
/// Produced by the `allow-next-line` keyword.
NextLine,
}
/// Flat intermediate representation of one parsed document.
///
/// Every field is filled in by [`Ir::parse`].
#[derive(Debug)]
pub(crate) struct Ir {
/// Text events outside code blocks, in event order.
pub(crate) prose_chunks: Vec<TextSlice>,
/// Result of `collect_autolinks` over the event stream.
pub(crate) autolinks: Vec<AutolinkFact>,
/// Inline code spans, in event order.
pub(crate) inline_codes: Vec<InlineCode>,
/// Code blocks, in the order their end events were seen.
pub(crate) code_blocks: Vec<CodeBlock>,
/// One entry per `Event::Html`.
pub(crate) html_blocks: Vec<HtmlBlock>,
/// One entry per `Event::InlineHtml`.
pub(crate) inline_html: Vec<InlineHtml>,
/// Headings, in the order their end events were seen.
pub(crate) headings: Vec<Heading>,
/// Lists, in the order their end events were seen.
pub(crate) list_groups: Vec<ListGroup>,
/// Reference table built by `build_reference_table`.
pub(crate) refs: ReferenceTable,
/// Directives found by `scan_suppressions` in `html_blocks`.
pub(crate) suppressions: Vec<Suppression>,
/// Leading frontmatter block, if `split_frontmatter` accepted one.
pub(crate) frontmatter: Option<Frontmatter>,
/// Math regions returned by `scan_math_regions`.
pub(crate) math_regions: Vec<MathRegion>,
/// Math errors returned by `scan_math_regions`.
pub(crate) math_errors: Vec<MathError>,
/// Line index built over the canonical source.
pub(crate) line_index: LineIndex,
// The finalized tree is only stored in test builds.
#[cfg(test)]
pub(crate) tree: crate::tree::Tree,
/// Value of `Tree::list_tightness_by_start` for the finalized tree.
pub(crate) list_tightness: Vec<(usize, bool)>,
/// Value of `Tree::link_like_ranges` for the finalized tree.
pub(crate) link_like_ranges: Vec<Range<usize>>,
/// Checkpoints produced by `build_block_checkpoints`.
pub(crate) block_checkpoints: Vec<BlockCheckpointFact>,
/// Facts assembled by `FormatFacts::from_parts`.
pub(crate) format_facts: FormatFacts,
}
impl Ir {
/// Parses `src` into the flat IR.
///
/// The canonical source is split into optional frontmatter and a body; only
/// the body is handed to the Markdown parser, and every event range is
/// shifted by the frontmatter length so offsets refer to the canonical
/// source.
///
/// # Errors
///
/// Propagates the error returned by `parse::collect_events_with_offsets`.
#[tracing::instrument(level = "info", name = "Ir::parse", skip(src), fields(len = src.canonical().len()))]
pub(crate) fn parse(src: &Source, opts: ParseOptions) -> Result<Self, ParseError> {
let canonical_src = CanonicalSource::from_source(src);
let source = canonical_src.as_str();
let line_index = LineIndex::new(source);
// `fm_end` is 0 when no frontmatter was recognised.
let (fm_end, frontmatter) = split_frontmatter(source);
let body = canonical_src.trusted_subrange(fm_end..source.len());
let mut builder = Builder {
source,
in_code_block: 0,
heading_stack: Vec::new(),
list_stack: Vec::new(),
code_block_stack: Vec::new(),
blockquote_stack: Vec::new(),
blockquote_ranges: Vec::new(),
list_item_ranges: Vec::new(),
prose_chunks: Vec::new(),
inline_codes: Vec::new(),
code_blocks: Vec::new(),
html_blocks: Vec::new(),
inline_html: Vec::new(),
headings: Vec::new(),
list_groups: Vec::new(),
};
// Event ranges are relative to `body`; rebase them onto `source`.
let events: Vec<(Event<'_>, Range<usize>)> = parse::collect_events_with_offsets(body, parse::options(opts))?
.into_iter()
.map(|(e, r)| {
let abs = r.start.saturating_add(fm_end)..r.end.saturating_add(fm_end);
(e, abs)
})
.collect();
let block_checkpoints = build_block_checkpoints(source, &events);
// First walk: collect the flat facts (prose, code, HTML, headings, lists).
for (event, abs) in &events {
builder.handle(event.clone(), abs.clone());
}
tracing::debug!(events = events.len(), "flat-IR walk complete");
let transparent_runs = compute_transparent_runs(source, &builder.blockquote_ranges, &builder.list_item_ranges);
// Code and HTML ranges are passed to the math scanner as exclusions.
let math_exclusions: Vec<Range<usize>> = builder
.inline_codes
.iter()
.map(|c| c.raw_range.clone())
.chain(builder.code_blocks.iter().map(|c| c.raw_range.clone()))
.chain(builder.html_blocks.iter().map(|h| h.raw_range.clone()))
.chain(builder.inline_html.iter().map(|h| h.raw_range.clone()))
.collect();
let (math_regions, math_errors) = scan_math_regions(
source,
&math_exclusions,
&transparent_runs,
opts.math().scanner_config(),
);
// Second walk: the tree builder is constructed with the math regions, so it
// runs after the math scan.
let mut tree_builder = TreeBuilder::new(source, &math_regions);
for (event, abs) in &events {
tree_builder.handle(event, abs.clone());
}
tracing::debug!(nodes = tree_builder.arena_len(), "tree walk complete");
let autolinks = collect_autolinks(source, &events, opts.extensions().gfm);
// `build_reference_table` takes events without their ranges.
let bare_events: Vec<Event<'_>> = events.iter().map(|(e, _)| e.clone()).collect();
let refs = build_reference_table(&bare_events, source);
let suppressions = scan_suppressions(&builder.html_blocks);
let tree = tree_builder.finalize(&refs);
let list_tightness = tree.list_tightness_by_start();
let link_like_ranges = tree.link_like_ranges();
let format_facts = FormatFacts::from_parts(
source,
&events,
&autolinks,
&math_regions,
&builder.code_blocks,
&builder.html_blocks,
&tree,
);
Ok(Self {
prose_chunks: builder.prose_chunks,
autolinks,
inline_codes: builder.inline_codes,
code_blocks: builder.code_blocks,
html_blocks: builder.html_blocks,
inline_html: builder.inline_html,
headings: builder.headings,
list_groups: builder.list_groups,
refs,
suppressions,
frontmatter,
math_regions,
math_errors,
line_index,
// The tree itself is only kept in test builds.
#[cfg(test)]
tree,
list_tightness,
link_like_ranges,
block_checkpoints,
format_facts,
})
}
/// Returns the line index built over the canonical source.
pub(crate) fn line_index(&self) -> &LineIndex {
&self.line_index
}
/// Test helper: parses `src` with default options and panics if parsing
/// fails.
#[cfg(test)]
#[allow(clippy::expect_used, reason = "test helper rejects invalid fixtures")]
pub(crate) fn parse_str(src: &str) -> Self {
let source = crate::source::Source::new(src);
Self::parse(&source, crate::ParseOptions::default()).expect("test Markdown parses")
}
}
/// Builds the list of block checkpoints for `source`.
///
/// The list always starts with a checkpoint at byte 0 (state `0`). Each event
/// is then fed to `walk_checkpoint_event`, which may record a checkpoint at the
/// event's start. A final checkpoint at the end of the source is appended
/// unless the last recorded byte already reaches it. Recorded bytes are
/// strictly increasing; offsets that do not fit in `u32` saturate.
fn build_block_checkpoints(source: &str, events: &[(Event<'_>, Range<usize>)]) -> Vec<BlockCheckpointFact> {
    let mut checkpoints = Vec::with_capacity((source.len() / 64).saturating_add(2));
    checkpoints.push(BlockCheckpointFact {
        byte: 0,
        parser_state: 0,
    });

    // Records a checkpoint only when it lies strictly after the previous one.
    let push_if_advancing = |out: &mut Vec<BlockCheckpointFact>, start: usize, nesting: u32, seen: u32| {
        let byte = u32::try_from(start).unwrap_or(u32::MAX);
        let advances = match out.last() {
            Some(previous) => previous.byte < byte,
            None => true,
        };
        if advances {
            out.push(BlockCheckpointFact {
                byte,
                parser_state: parser_state_hash(nesting, seen),
            });
        }
    };

    let mut nesting: u32 = 0;
    let mut seen: u32 = 0;
    for (event, span) in events {
        seen = seen.saturating_add(1);
        walk_checkpoint_event(
            event.clone(),
            span.start,
            &mut nesting,
            seen,
            &mut checkpoints,
            &push_if_advancing,
        );
    }

    let end_byte = u32::try_from(source.len()).unwrap_or(u32::MAX);
    let needs_tail = checkpoints.last().is_none_or(|previous| previous.byte < end_byte);
    if needs_tail {
        checkpoints.push(BlockCheckpointFact {
            byte: end_byte,
            parser_state: parser_state_hash(nesting, seen),
        });
    }
    checkpoints
}
/// Applies one event to the checkpoint walk.
///
/// * A start tag seen at depth 0 that `is_top_level_block` accepts asks
///   `try_push` to record a checkpoint at `range_start`.
/// * Any start tag accepted by `is_container` increases `depth`; the matching
///   end tags decrease it (both saturating).
/// * A thematic break at depth 0 also asks for a checkpoint.
/// * Every other event is ignored.
fn walk_checkpoint_event(
    event: Event<'_>,
    range_start: usize,
    depth: &mut u32,
    event_count: u32,
    points: &mut Vec<BlockCheckpointFact>,
    try_push: &impl Fn(&mut Vec<BlockCheckpointFact>, usize, u32, u32),
) {
    let at_top_level = *depth == 0;
    match event {
        Event::Start(tag) => {
            // The checkpoint is recorded with the depth *before* entering the
            // container, so it is pushed ahead of the increment.
            if at_top_level && is_top_level_block(&tag) {
                try_push(points, range_start, *depth, event_count);
            }
            if is_container(&tag) {
                *depth = depth.saturating_add(1);
            }
        }
        Event::End(end) => {
            if is_container_end(end) {
                *depth = depth.saturating_sub(1);
            }
        }
        Event::Rule => {
            if at_top_level {
                try_push(points, range_start, *depth, event_count);
            }
        }
        // Listed explicitly (no wildcard) so the match stays exhaustive.
        Event::Text(_)
        | Event::Code(_)
        | Event::InlineMath(_)
        | Event::DisplayMath(_)
        | Event::Html(_)
        | Event::InlineHtml(_)
        | Event::FootnoteReference(_)
        | Event::SoftBreak
        | Event::HardBreak
        | Event::TaskListMarker(_) => {}
    }
}
/// Returns `true` for the start tags that may open a checkpoint when seen at
/// depth 0: leaf blocks (paragraph, heading, code block, HTML block) and
/// container blocks (block quote, list, table, footnote definition).
fn is_top_level_block(tag: &Tag<'_>) -> bool {
    let is_leaf_block = matches!(
        tag,
        Tag::Paragraph | Tag::Heading { .. } | Tag::CodeBlock(_) | Tag::HtmlBlock
    );
    let is_container_block = matches!(
        tag,
        Tag::BlockQuote(_) | Tag::List(_) | Tag::Table(_) | Tag::FootnoteDefinition(_)
    );
    is_leaf_block || is_container_block
}
/// Returns `true` for start tags that increase the checkpoint walk's nesting
/// depth: block quotes, lists and their items, footnote definitions, and the
/// table / head / row / cell tags.
fn is_container(tag: &Tag<'_>) -> bool {
    let is_table_part = matches!(
        tag,
        Tag::Table(_) | Tag::TableHead | Tag::TableRow | Tag::TableCell
    );
    let is_block_container = matches!(
        tag,
        Tag::BlockQuote(_) | Tag::List(_) | Tag::Item | Tag::FootnoteDefinition(_)
    );
    is_block_container || is_table_part
}
/// Returns `true` for end tags that decrease the checkpoint walk's nesting
/// depth; the set mirrors the start tags accepted by `is_container`.
fn is_container_end(end: TagEnd) -> bool {
    let is_table_part = matches!(
        end,
        TagEnd::Table | TagEnd::TableHead | TagEnd::TableRow | TagEnd::TableCell
    );
    let is_block_container = matches!(
        end,
        TagEnd::BlockQuote(_) | TagEnd::List(_) | TagEnd::Item | TagEnd::FootnoteDefinition
    );
    is_block_container || is_table_part
}
/// Packs `depth` into the high 32 bits and `event_count` into the low 32 bits
/// of a single `u64`.
fn parser_state_hash(depth: u32, event_count: u32) -> u64 {
    let high_bits = u64::from(depth) << 32;
    let low_bits = u64::from(event_count);
    high_bits | low_bits
}
/// Accumulator for the flat-IR event walk.
///
/// The `*_stack` fields hold blocks whose start event has been seen but whose
/// end event has not; the remaining vectors hold finished facts.
struct Builder<'a> {
/// Source text that all byte ranges index into.
source: &'a str,
/// Number of currently open code blocks; text events are skipped while it
/// is non-zero.
in_code_block: u32,
/// Open headings as `(start byte, level)`.
heading_stack: Vec<(usize, u32)>,
/// Open lists, innermost last.
list_stack: Vec<OpenList>,
/// Open code blocks as `(start byte, info string, fenced)`.
code_block_stack: Vec<(usize, String, bool)>,
/// Start bytes of open block quotes.
blockquote_stack: Vec<usize>,
/// Byte ranges of closed block quotes.
blockquote_ranges: Vec<Range<usize>>,
/// Every list item's byte range paired with its continuation width.
list_item_ranges: Vec<(Range<usize>, u8)>,
/// Collected prose chunks.
prose_chunks: Vec<TextSlice>,
/// Collected inline code spans.
inline_codes: Vec<InlineCode>,
/// Collected code blocks.
code_blocks: Vec<CodeBlock>,
/// Collected HTML block events.
html_blocks: Vec<HtmlBlock>,
/// Collected inline HTML events.
inline_html: Vec<InlineHtml>,
/// Collected headings.
headings: Vec<Heading>,
/// Collected lists.
list_groups: Vec<ListGroup>,
}
/// A list whose start event has been seen but not yet its end event.
struct OpenList {
/// Start byte of the list's start event.
start: usize,
/// `true` when the parser reported a start number for the list.
ordered: bool,
/// Items collected so far.
items: Vec<ListItem>,
}
impl Builder<'_> {
    /// Routes one parser event to the matching collector. Events other than
    /// start/end tags, text, inline code and HTML are ignored.
    #[allow(clippy::wildcard_enum_match_arm)]
    fn handle(&mut self, event: Event<'_>, range: Range<usize>) {
        match event {
            Event::Text(_) => self.push_prose(range),
            Event::Code(_) => self.push_inline_code(range),
            Event::Html(_) => self.push_html_block(range),
            Event::InlineHtml(_) => self.push_inline_html(range),
            Event::Start(opened) => self.start(opened, range),
            Event::End(closed) => self.end(closed, range),
            _ => {}
        }
    }

    /// Records the opening of a heading, code block, list, list item or block
    /// quote. Other tags are ignored.
    #[allow(clippy::wildcard_enum_match_arm)]
    fn start(&mut self, tag: Tag<'_>, range: Range<usize>) {
        match tag {
            Tag::BlockQuote(_) => self.blockquote_stack.push(range.start),
            Tag::Heading { level, .. } => self.heading_stack.push((range.start, level as u32)),
            Tag::List(first_number) => {
                let opened = OpenList {
                    start: range.start,
                    ordered: first_number.is_some(),
                    items: Vec::new(),
                };
                self.list_stack.push(opened);
            }
            Tag::CodeBlock(kind) => {
                self.in_code_block = self.in_code_block.saturating_add(1);
                let pending = match kind {
                    CodeBlockKind::Indented => (range.start, String::new(), false),
                    CodeBlockKind::Fenced(info) => (range.start, info.into_string(), true),
                };
                self.code_block_stack.push(pending);
            }
            Tag::Item => {
                let in_ordered_list = match self.list_stack.last() {
                    Some(list) => list.ordered,
                    None => false,
                };
                let marker_byte =
                    derive_item_marker_byte(self.source, range.clone(), in_ordered_list).unwrap_or(b'-');
                let continuation = item_continuation_width(self.source, &range);
                // Every item range is tracked, even when no list is open.
                self.list_item_ranges.push((range.clone(), continuation));
                if let Some(list) = self.list_stack.last_mut() {
                    list.items.push(ListItem {
                        raw_range: range,
                        marker_byte,
                    });
                }
            }
            _ => {}
        }
    }

    /// Finishes the innermost open heading, code block, list or block quote
    /// and stores the resulting fact. An end tag with no matching open entry
    /// stores nothing.
    #[allow(clippy::wildcard_enum_match_arm)]
    fn end(&mut self, tag: TagEnd, range: Range<usize>) {
        let close = range.end;
        match tag {
            TagEnd::BlockQuote(_) => {
                if let Some(open) = self.blockquote_stack.pop() {
                    self.blockquote_ranges.push(open..close);
                }
            }
            TagEnd::List(_) => {
                let Some(list) = self.list_stack.pop() else {
                    return;
                };
                self.list_groups.push(ListGroup {
                    raw_range: list.start..close,
                    ordered: list.ordered,
                    items: list.items,
                });
            }
            TagEnd::Heading(_) => {
                let Some((open, level)) = self.heading_stack.pop() else {
                    return;
                };
                let raw = self.source.get(open..close).unwrap_or("");
                let (text, skipped) = trim_heading(raw);
                self.headings.push(Heading {
                    text: text.to_owned(),
                    byte_offset: open.saturating_add(skipped),
                    raw_range: open..close,
                    level,
                });
            }
            TagEnd::CodeBlock => {
                // The counter drops even if the stack turns out to be empty.
                self.in_code_block = self.in_code_block.saturating_sub(1);
                let Some((open, info, fenced)) = self.code_block_stack.pop() else {
                    return;
                };
                let text = self.source.get(open..close).unwrap_or("").to_owned();
                self.code_blocks.push(CodeBlock {
                    text,
                    byte_offset: open,
                    raw_range: open..close,
                    info,
                    fenced,
                });
            }
            _ => {}
        }
    }

    /// Stores a text event as a prose chunk unless a code block is open. When
    /// the byte just before the event is a backslash, the chunk is widened by
    /// one byte to include it.
    fn push_prose(&mut self, range: Range<usize>) {
        if self.in_code_block > 0 {
            return;
        }
        let source_bytes = self.source.as_bytes();
        let start = range
            .start
            .checked_sub(1)
            .filter(|&previous| source_bytes.get(previous) == Some(&b'\\'))
            .unwrap_or(range.start);
        let span = start..range.end;
        if let Some(text) = self.source.get(span.clone()) {
            self.prose_chunks.push(TextSlice {
                text: text.to_owned(),
                byte_offset: start,
                raw_range: span,
            });
        }
    }

    /// Stores an inline code span. The text excludes the leading and trailing
    /// backtick runs when both exist and leave something between them;
    /// otherwise the whole span is kept.
    fn push_inline_code(&mut self, range: Range<usize>) {
        let raw = self.source.get(range.clone()).unwrap_or("");
        let opening = raw.bytes().take_while(|&b| b == b'`').count();
        let closing = raw.bytes().rev().take_while(|&b| b == b'`').count();
        let has_inner = opening > 0 && closing > 0 && opening.saturating_add(closing) < raw.len();
        let content = if has_inner {
            range.start.saturating_add(opening)..range.end.saturating_sub(closing)
        } else {
            range.clone()
        };
        if let Some(text) = self.source.get(content.clone()) {
            self.inline_codes.push(InlineCode {
                text: text.to_owned(),
                byte_offset: content.start,
                raw_range: range,
            });
        }
    }

    /// Stores the source text of an `Event::Html` event.
    fn push_html_block(&mut self, range: Range<usize>) {
        if let Some(text) = self.source.get(range.clone()) {
            let byte_offset = range.start;
            self.html_blocks.push(HtmlBlock {
                text: text.to_owned(),
                byte_offset,
                raw_range: range,
            });
        }
    }

    /// Stores the source text of an `Event::InlineHtml` event.
    fn push_inline_html(&mut self, range: Range<usize>) {
        if let Some(text) = self.source.get(range.clone()) {
            let byte_offset = range.start;
            self.inline_html.push(InlineHtml {
                text: text.to_owned(),
                byte_offset,
                raw_range: range,
            });
        }
    }
}
/// Finds the marker byte of a list item inside `range` of `source`.
///
/// For an ordered list this is the first ASCII digit in the range; otherwise
/// it is the first `-`, `*` or `+`. Returns `None` when the range is out of
/// bounds or holds no such byte.
fn derive_item_marker_byte(source: &str, range: core::ops::Range<usize>, ordered: bool) -> Option<u8> {
    let item = source.as_bytes().get(range)?;
    if ordered {
        item.iter().copied().find(u8::is_ascii_digit)
    } else {
        item.iter().copied().find(|byte| matches!(byte, b'-' | b'*' | b'+'))
    }
}
/// Measures the width of a list item's marker on its first non-blank line.
///
/// The width counts leading spaces/tabs, the marker itself (`-`, `*`, `+`, or
/// digits followed by `.` or `)`), and at most one following space. Returns 0
/// when the range is out of bounds, every line is blank, or the first
/// non-blank line does not start with a marker. Widths above 255 saturate.
fn item_continuation_width(source: &str, raw_range: &Range<usize>) -> u8 {
    let item = source.as_bytes().get(raw_range.clone()).unwrap_or(&[]);
    let first_content_line = item
        .split(|&b| b == b'\n')
        .find(|line| line.iter().any(|b| !matches!(*b, b' ' | b'\t' | b'\r')));
    let Some(line) = first_content_line else {
        return 0;
    };

    let indent = line.iter().take_while(|b| matches!(**b, b' ' | b'\t')).count();
    let after_indent = line.get(indent..).unwrap_or(&[]);
    let digits = after_indent.iter().take_while(|b| b.is_ascii_digit()).count();

    let marker_len = if digits > 0 {
        // An ordered marker needs its `.` or `)` delimiter.
        if !matches!(after_indent.get(digits), Some(b'.' | b')')) {
            return 0;
        }
        digits.saturating_add(1)
    } else if matches!(after_indent.first(), Some(b'-' | b'*' | b'+')) {
        1
    } else {
        return 0;
    };

    let mut width = indent.saturating_add(marker_len);
    if line.get(width) == Some(&b' ') {
        width = width.saturating_add(1);
    }
    u8::try_from(width).unwrap_or(u8::MAX)
}
/// Computes, per source line, the leading byte run made up of container
/// prefixes: block-quote markers and list-item continuation indentation.
///
/// For each line the cursor repeatedly skips either
/// * up to three spaces, a `>` and one optional space, when the cursor lies
///   inside one of `blockquote_ranges`; or
/// * up to `width` spaces, where `width` belongs to the latest-starting list
///   item that began before this line and still covers the cursor.
///
/// A line contributes `line_start..cursor` when the cursor moved. Returns an
/// empty vector when both range lists are empty.
fn compute_transparent_runs(
    source: &str,
    blockquote_ranges: &[Range<usize>],
    list_item_ranges: &[(Range<usize>, u8)],
) -> Vec<Range<usize>> {
    let mut runs: Vec<Range<usize>> = Vec::new();
    if blockquote_ranges.is_empty() && list_item_ranges.is_empty() {
        return runs;
    }
    let bytes = source.as_bytes();
    let byte_at = |pos: usize| bytes.get(pos).copied();

    let mut line_start = 0usize;
    loop {
        let line_end = bytes
            .get(line_start..)
            .and_then(|tail| tail.iter().position(|&b| b == b'\n'))
            .map_or(bytes.len(), |offset| line_start.saturating_add(offset));

        let mut cursor = line_start;
        loop {
            // Block-quote marker: optional indent of at most three spaces, then `>`.
            let indent = (0..3usize)
                .take_while(|&k| byte_at(cursor.saturating_add(k)) == Some(b' '))
                .count();
            let marker_pos = cursor.saturating_add(indent);
            let quote_marker_here = marker_pos < line_end
                && byte_at(marker_pos) == Some(b'>')
                && blockquote_ranges.iter().any(|quote| quote.contains(&cursor));
            if quote_marker_here {
                cursor = marker_pos.saturating_add(1);
                if cursor < line_end && byte_at(cursor) == Some(b' ') {
                    cursor = cursor.saturating_add(1);
                }
                continue;
            }

            // List continuation: only items that started on an earlier line count.
            let innermost_width = list_item_ranges
                .iter()
                .filter(|(item, _)| item.start < line_start && cursor < item.end)
                .max_by_key(|(item, _)| item.start)
                .map(|(_, width)| usize::from(*width));
            let Some(width) = innermost_width else {
                break;
            };
            let consumed = (0..width)
                .take_while(|&k| {
                    let pos = cursor.saturating_add(k);
                    pos < line_end && byte_at(pos) == Some(b' ')
                })
                .count();
            if consumed == 0 {
                break;
            }
            cursor = cursor.saturating_add(consumed);
        }

        if cursor > line_start {
            runs.push(line_start..cursor);
        }
        if line_end >= bytes.len() {
            break;
        }
        line_start = line_end.saturating_add(1);
    }
    runs
}
/// Extracts the visible text of a heading from its raw source span.
///
/// Only the first line of `raw` is considered. Leading `#` characters and the
/// spaces/tabs after them are skipped, trailing spaces/tabs are dropped, and a
/// trailing run of `#` is removed together with the blanks before it.
///
/// The trailing run is only treated as a closing sequence when it is preceded
/// by a space or tab, or when nothing but hashes remains. A `#` glued to the
/// text, as in `# C#` or `# foo#`, is part of the heading and is kept.
///
/// Returns the trimmed text and the byte offset within `raw` at which the
/// content begins (number of leading hashes plus the blanks after them).
fn trim_heading(raw: &str) -> (&str, usize) {
    let is_blank = |c: char| c == ' ' || c == '\t';

    let body = raw.strip_suffix('\n').unwrap_or(raw);
    let body = body.split_once('\n').map_or(body, |(first, _)| first);

    let lead_hashes = body.bytes().take_while(|&b| b == b'#').count();
    let after_hashes = body.get(lead_hashes..).unwrap_or("");
    let lead_ws = after_hashes.bytes().take_while(|&b| b == b' ' || b == b'\t').count();
    let inner_start = lead_hashes.saturating_add(lead_ws);
    let inner = body.get(inner_start..).unwrap_or("");

    let content = inner.trim_end_matches(is_blank);
    let without_hashes = content.trim_end_matches('#');
    // An empty remainder means the hashes directly follow the opening
    // sequence's whitespace, so they form a valid closing sequence too.
    let closing_is_delimited = matches!(without_hashes.as_bytes().last(), None | Some(b' ' | b'\t'));
    let text = if closing_is_delimited {
        without_hashes.trim_end_matches(is_blank)
    } else {
        content
    };
    (text, inner_start)
}
/// Detects a frontmatter block at the very start of `source`.
///
/// The first line must be exactly `---` (YAML) or `+++` (TOML), ignoring
/// trailing whitespace. The block ends at the first later line equal to the
/// closing delimiter (`---` or `...` for YAML, `+++` for TOML). The block is
/// only accepted when `frontmatter_body_has_key` finds a key line in the text
/// up to and including the closing line.
///
/// Returns the byte offset at which the Markdown body starts together with the
/// frontmatter, or `(0, None)` when there is none.
fn split_frontmatter(source: &str) -> (usize, Option<Frontmatter>) {
    let (opening_line, body_start) = match source.find('\n') {
        Some(newline) => (source.get(..newline).unwrap_or(""), newline.saturating_add(1)),
        None => (source, source.len()),
    };
    let delimiter = match opening_line.trim_end() {
        "---" => FrontmatterDelimiter::Yaml,
        "+++" => FrontmatterDelimiter::Toml,
        _ => return (0, None),
    };
    let Some(rest) = source.get(body_start..) else {
        return (0, None);
    };
    let closes_block = |line: &str| match delimiter {
        FrontmatterDelimiter::Yaml => matches!(line, "---" | "..."),
        FrontmatterDelimiter::Toml => line == "+++",
    };

    let mut line_start = 0usize;
    while line_start < rest.len() {
        let line_end = rest
            .get(line_start..)
            .and_then(|tail| tail.find('\n'))
            .map_or(rest.len(), |offset| line_start.saturating_add(offset));
        let line = rest.get(line_start..line_end).unwrap_or("");

        if closes_block(line.trim_end()) {
            let candidate = rest.get(..line_end).unwrap_or("");
            if !frontmatter_body_has_key(candidate, delimiter) {
                return (0, None);
            }
            // Include the closing line's newline when there is one.
            let total = body_start
                .saturating_add(line_end)
                .saturating_add(1)
                .min(source.len());
            let slice = TextSlice {
                text: source.get(0..total).unwrap_or("").to_owned(),
                byte_offset: 0,
                raw_range: 0..total,
            };
            return (total, Some(Frontmatter { slice, delimiter }));
        }
        line_start = line_end.saturating_add(1);
    }
    (0, None)
}
/// Returns `true` when at least one line of `body` looks like a key line for
/// the given delimiter style: `key:` for YAML, `key =` for TOML.
fn frontmatter_body_has_key(body: &str, delimiter: FrontmatterDelimiter) -> bool {
    let separator = match delimiter {
        FrontmatterDelimiter::Toml => b'=',
        FrontmatterDelimiter::Yaml => b':',
    };
    for line in body.lines() {
        if line_has_key(line, separator) {
            return true;
        }
    }
    false
}
/// Returns `true` when `line` has the shape `<blanks><key><blanks><key_byte>…`.
///
/// A key starts with an ASCII letter or `_` and continues with ASCII letters,
/// digits, `_`, `-` or `.`. Blanks are spaces and tabs. Anything after
/// `key_byte` is ignored.
fn line_has_key(line: &str, key_byte: u8) -> bool {
    fn is_blank(byte: u8) -> bool {
        matches!(byte, b' ' | b'\t')
    }

    let bytes = line.as_bytes();
    let indent = bytes.iter().take_while(|&&b| is_blank(b)).count();
    let Some((&first, tail)) = bytes.get(indent..).and_then(<[u8]>::split_first) else {
        return false;
    };
    if !(first.is_ascii_alphabetic() || first == b'_') {
        return false;
    }
    let key_tail_len = tail
        .iter()
        .take_while(|&&b| b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-' | b'.'))
        .count();
    // The first non-blank byte after the key must be the separator.
    tail.get(key_tail_len..)
        .unwrap_or(&[])
        .iter()
        .copied()
        .find(|&b| !is_blank(b))
        == Some(key_byte)
}
/// Returns the lazily compiled regex that recognises an mdwright suppression
/// comment. It captures the directive keyword as `kind` and the optional rule
/// list as `names`.
fn suppression_regex() -> &'static Regex {
    const PATTERN: &str = r"^ {0,3}<!--\s*mdwright:\s*(?P<kind>allow-next-line|allow|disable-all|enable-all|disable|enable)(?:[ \t]+(?P<names>[\w\-,\s]+?))?\s*-->\s*$";
    static COMPILED: OnceLock<Regex> = OnceLock::new();
    COMPILED.get_or_init(|| compile_static(PATTERN))
}
/// Extracts suppression directives from HTML blocks.
///
/// Each block's text (with trailing whitespace removed) is matched against
/// `suppression_regex`. Blocks that do not match are skipped, and so are
/// `allow` / `allow-next-line` directives that name no rules. The result keeps
/// the order of `html_blocks`.
fn scan_suppressions(html_blocks: &[HtmlBlock]) -> Vec<Suppression> {
    let directive = suppression_regex();
    html_blocks
        .iter()
        .filter_map(|block| {
            let caps = directive.captures(block.text.trim_end())?;
            let kind = match caps.name("kind")?.as_str() {
                "allow-next-line" => SuppressionKind::Allow {
                    scope: AllowScope::NextLine,
                },
                "allow" => SuppressionKind::Allow {
                    scope: AllowScope::Block,
                },
                "enable" | "enable-all" => SuppressionKind::Enable,
                "disable" | "disable-all" => SuppressionKind::Disable,
                _ => return None,
            };
            let names = match caps.name("names") {
                Some(found) => found.as_str(),
                None => "",
            };
            let rules: Vec<String> = names
                .split([',', ' ', '\t'])
                .filter(|name| !name.is_empty())
                .map(str::to_owned)
                .collect();
            // A bare `allow` with no rule names is not a valid directive.
            if rules.is_empty() && matches!(kind, SuppressionKind::Allow { .. }) {
                return None;
            }
            Some(Suppression {
                kind,
                rules,
                raw_range: block.raw_range.clone(),
            })
        })
        .collect()
}
// Unit tests for the flat IR: prose/code extraction, frontmatter splitting,
// headings, lists, link definitions, suppression directives and transparent
// runs.
#[cfg(test)]
#[allow(
clippy::indexing_slicing,
reason = "test asserts; panic surface is the test framework"
)]
mod tests {
use super::Ir;
// Converts a missing `Option` value into an `Err` carrying `label`, so tests
// can use `?` instead of unwrapping.
fn some_ref<'a, T>(value: Option<&'a T>, label: &str) -> Result<&'a T, String> {
match value {
Some(value) => Ok(value),
None => Err(label.to_owned()),
}
}
// A prose chunk keeps the backslash of an escaped character.
#[test]
fn prose_chunks_include_backslash_escapes() {
let ir = Ir::parse_str(r"a \_b\_ c");
let texts: Vec<&str> = ir.prose_chunks.iter().map(|c| c.text.as_str()).collect();
assert!(
texts.iter().any(|t| t.contains(r"\_")),
"prose chunks should preserve `\\_`: {texts:?}"
);
}
// Text inside a fenced code block never shows up as prose; text after the
// block does.
#[test]
fn fenced_code_excluded_from_prose() {
let src = "before\n```\nx \\_y\\_ z\n```\nafter \\_outside\\_\n";
let ir = Ir::parse_str(src);
for c in &ir.prose_chunks {
assert!(!c.text.contains("\\_y"), "prose chunk leaked code body: {:?}", c.text);
}
let texts: Vec<&str> = ir.prose_chunks.iter().map(|c| c.text.as_str()).collect();
assert!(texts.iter().any(|t| t.contains("\\_")), "no chunk has `\\_`: {texts:?}");
assert!(
texts.iter().any(|t| t.contains("outside")),
"no chunk has `outside`: {texts:?}"
);
assert_eq!(ir.code_blocks.len(), 1);
}
// Inline code text excludes the surrounding backticks.
#[test]
fn inline_code_strips_fences() -> Result<(), String> {
let ir = Ir::parse_str("see `foo_bar` here\n");
assert_eq!(ir.inline_codes.len(), 1);
let code = some_ref(ir.inline_codes.first(), "missing")?;
assert_eq!(code.text, "foo_bar");
Ok(())
}
// YAML frontmatter is split off and the body is still parsed.
#[test]
fn frontmatter_split() -> Result<(), String> {
let src = "---\ntitle: T\n---\nbody text\n";
let ir = Ir::parse_str(src);
let fm = some_ref(ir.frontmatter.as_ref(), "frontmatter")?;
assert_eq!(fm.delimiter, super::FrontmatterDelimiter::Yaml);
let body_chunks: Vec<&str> = ir.prose_chunks.iter().map(|c| c.text.as_str()).collect();
assert!(body_chunks.iter().any(|t| t == &"body text"));
Ok(())
}
// A leading `---` with no closing delimiter is not frontmatter; the rest of
// the document is parsed as Markdown.
#[test]
fn frontmatter_opener_without_close_is_thematic_break() -> Result<(), String> {
let src = "---\n\n- a\n- a\n\n- a\n";
let ir = Ir::parse_str(src);
assert!(ir.frontmatter.is_none(), "no frontmatter without close");
let any_a = ir.prose_chunks.iter().any(|c| c.text == "a");
assert!(
any_a,
"body markdown should be parsed as prose, got {:?}",
ir.prose_chunks
);
Ok(())
}
// TOML frontmatter is split off and the body is still parsed.
#[test]
fn frontmatter_toml_split() -> Result<(), String> {
let src = "+++\ntitle = \"T\"\n+++\nbody text\n";
let ir = Ir::parse_str(src);
let fm = some_ref(ir.frontmatter.as_ref(), "frontmatter")?;
assert_eq!(fm.delimiter, super::FrontmatterDelimiter::Toml);
let body_chunks: Vec<&str> = ir.prose_chunks.iter().map(|c| c.text.as_str()).collect();
assert!(body_chunks.iter().any(|t| t == &"body text"));
Ok(())
}
// Heading text has its `#` markers removed and carries the right level.
#[test]
fn headings_trimmed_and_levelled() {
let ir = Ir::parse_str("# One\n\n## Two ##\n\n### Three\n");
assert_eq!(ir.headings.len(), 3);
let texts: Vec<(&str, u32)> = ir.headings.iter().map(|h| (h.text.as_str(), h.level)).collect();
assert_eq!(texts, vec![("One", 1), ("Two", 2), ("Three", 3)]);
}
// A change of bullet character starts a new list; each item records its
// marker byte.
#[test]
fn list_groups_record_markers() -> Result<(), String> {
let src = "- one\n- two\n* three\n";
let ir = Ir::parse_str(src);
assert_eq!(ir.list_groups.len(), 2);
let g1 = some_ref(ir.list_groups.first(), "first list")?;
assert!(!g1.ordered);
let markers: Vec<u8> = g1.items.iter().map(|i| i.marker_byte).collect();
assert_eq!(markers, vec![b'-', b'-']);
let g2 = some_ref(ir.list_groups.get(1), "second list")?;
let item = some_ref(g2.items.first(), "item")?;
assert_eq!(item.marker_byte, b'*');
Ok(())
}
// A link reference definition ends up in the reference table.
#[test]
fn link_defs_scanned() -> Result<(), String> {
let src = "[bar]: https://example.com\n\nSee [ref][bar].\n";
let ir = Ir::parse_str(src);
let target = some_ref(ir.refs.iter().next(), "expected one target")?;
assert_eq!(target.label_raw, "bar");
assert_eq!(target.dest, "https://example.com");
Ok(())
}
// Definition-like text inside a code block is not a link definition.
#[test]
fn link_defs_skipped_inside_code_block() {
let src = "```\n[bar]: https://example.com\n```\n";
let ir = Ir::parse_str(src);
assert!(ir.refs.is_empty());
}
// Opening and closing inline tags are collected separately.
#[test]
fn inline_html_collected() {
let src = "before <span>x</span> after\n";
let ir = Ir::parse_str(src);
assert!(ir.inline_html.iter().any(|h| h.text == "<span>"));
assert!(ir.inline_html.iter().any(|h| h.text == "</span>"));
}
// A fenced block records its info string.
#[test]
fn code_block_info_string() -> Result<(), String> {
let src = "```rust\nfn x() {}\n```\n";
let ir = Ir::parse_str(src);
assert_eq!(ir.code_blocks.len(), 1);
let cb = some_ref(ir.code_blocks.first(), "cb")?;
assert_eq!(cb.info, "rust");
assert!(cb.fenced);
Ok(())
}
// --- Suppression directive tests ---
use super::{AllowScope, SuppressionKind};
// `allow <rule>` yields a block-scoped allow.
#[test]
fn suppression_allow_parses() -> Result<(), String> {
let src = "<!-- mdwright: allow heading-punctuation -->\n# Title.\n";
let ir = Ir::parse_str(src);
assert_eq!(ir.suppressions.len(), 1);
let s = some_ref(ir.suppressions.first(), "first")?;
assert_eq!(
s.kind,
SuppressionKind::Allow {
scope: AllowScope::Block
}
);
assert_eq!(s.rules, vec!["heading-punctuation"]);
Ok(())
}
// `allow-next-line <rule>` yields a next-line-scoped allow.
#[test]
fn suppression_allow_next_line_parses() -> Result<(), String> {
let src = "<!-- mdwright: allow-next-line trailing-whitespace -->\nfoo \n";
let ir = Ir::parse_str(src);
let s = some_ref(ir.suppressions.first(), "first")?;
assert_eq!(
s.kind,
SuppressionKind::Allow {
scope: AllowScope::NextLine
}
);
Ok(())
}
// Comma-separated rule names are split into individual rules.
#[test]
fn suppression_multiple_rules_parses() -> Result<(), String> {
let src = "<!-- mdwright: allow rule-a, rule-b, rule-c -->\nbody\n";
let ir = Ir::parse_str(src);
let s = some_ref(ir.suppressions.first(), "first")?;
assert_eq!(s.rules, vec!["rule-a", "rule-b", "rule-c"]);
Ok(())
}
// `disable` and `enable` directives are both recognised, in order.
#[test]
fn suppression_disable_enable_parse() -> Result<(), String> {
let src = "<!-- mdwright: disable bare-url -->\n\nfoo\n\n<!-- mdwright: enable bare-url -->\n";
let ir = Ir::parse_str(src);
assert_eq!(ir.suppressions.len(), 2);
let first = some_ref(ir.suppressions.first(), "first")?;
let second = some_ref(ir.suppressions.get(1), "second")?;
assert_eq!(first.kind, SuppressionKind::Disable);
assert_eq!(second.kind, SuppressionKind::Enable);
Ok(())
}
// `disable-all` maps to `Disable` with no rule names.
#[test]
fn suppression_disable_all_alias_parses() -> Result<(), String> {
let src = "<!-- mdwright: disable-all -->\nfoo\n";
let ir = Ir::parse_str(src);
let s = some_ref(ir.suppressions.first(), "first")?;
assert_eq!(s.kind, SuppressionKind::Disable);
assert!(s.rules.is_empty());
Ok(())
}
// An `allow` with no rule names is rejected.
#[test]
fn suppression_bare_allow_rejected() {
let src = "<!-- mdwright: allow -->\n# Title\n";
let ir = Ir::parse_str(src);
assert!(ir.suppressions.is_empty());
}
// A directive embedded in a paragraph (inline HTML) is not a suppression.
#[test]
fn suppression_inline_html_ignored() {
let src = "Some text <!-- mdwright: allow bare-url --> more text.\n";
let ir = Ir::parse_str(src);
assert!(ir.suppressions.is_empty());
}
// A directive with leading indentation is still recognised.
#[test]
fn suppression_with_indent_parses() -> Result<(), String> {
let src = " <!-- mdwright: allow heading-punctuation -->\n# Title.\n";
let ir = Ir::parse_str(src);
let s = some_ref(ir.suppressions.first(), "first")?;
assert_eq!(s.rules, vec!["heading-punctuation"]);
Ok(())
}
// --- Transparent-run tests (call `compute_transparent_runs` directly) ---
use super::compute_transparent_runs;
// Each `> ` prefix inside a block quote is a transparent run.
#[test]
fn transparent_runs_for_blockquote_continuation() {
let src = "> a\n> b\n";
let bq = 0..src.len();
let runs = compute_transparent_runs(src, std::slice::from_ref(&bq), &[]);
assert_eq!(runs, vec![0..2, 4..6]);
}
// Nested block-quote prefixes are merged into one run per line.
#[test]
fn transparent_runs_for_nested_blockquote() {
let src = "> > a\n> > b\n";
let outer = 0..src.len();
let inner = 2..src.len();
let runs = compute_transparent_runs(src, &[outer, inner], &[]);
assert_eq!(runs, vec![0..4, 6..10]);
}
// Indentation on a list item's continuation line is a transparent run.
#[test]
fn transparent_runs_for_list_item_continuation() {
let src = "1. a\n b\n";
let item = (0..src.len(), 3);
let runs = compute_transparent_runs(src, &[], &[item]);
assert_eq!(runs, vec![5..8]);
}
// With no containers there are no transparent runs.
#[test]
fn transparent_runs_empty_for_plain_paragraph() {
let src = "hello\nworld\n";
let runs = compute_transparent_runs(src, &[], &[]);
assert!(runs.is_empty(), "expected empty: {runs:?}");
}
}