1use std::ops::Range;
23use std::sync::OnceLock;
24
25use pulldown_cmark::{CodeBlockKind, Event, Tag, TagEnd};
26use regex::Regex;
27
28use crate::format_facts::FormatFacts;
29use crate::gfm::{AutolinkFact, collect_autolinks};
30use crate::line_index::LineIndex;
31use crate::parse;
32use crate::refs::{ReferenceTable, build_reference_table};
33use crate::source::{CanonicalSource, Source};
34use crate::tree::TreeBuilder;
35use crate::util::regex::compile_static;
36use crate::{ParseError, ParseOptions};
37use mdwright_math::{MathError, MathRegion, scan_math_regions};
38
/// An owned copy of a span of the source text plus its location.
///
/// Used for prose chunks and as the payload of [`Frontmatter`].
#[derive(Clone, Debug)]
pub struct TextSlice {
    /// Owned copy of the source bytes covered by `raw_range`.
    pub text: String,
    /// Byte offset in the source at which `text` begins.
    pub byte_offset: usize,
    /// Byte range of the slice in the source.
    pub raw_range: Range<usize>,
}
47
/// One inline code span, recorded per `Event::Code`.
#[derive(Clone, Debug)]
pub struct InlineCode {
    /// Span content; the surrounding backtick runs are removed when both a
    /// leading and a trailing run are found around non-empty content.
    pub text: String,
    /// Byte offset in the source at which `text` begins (after the opening
    /// backticks when they were stripped).
    pub byte_offset: usize,
    /// Byte range of the whole event, backticks included.
    pub raw_range: Range<usize>,
}
56
/// One code block, recorded when its end tag is seen.
#[derive(Clone, Debug)]
pub struct CodeBlock {
    /// Source text from the start of the block's start event to the end of
    /// its end event.
    pub text: String,
    /// Byte offset in the source at which the block starts.
    pub byte_offset: usize,
    /// Byte range of the whole block in the source.
    pub raw_range: Range<usize>,
    /// Info string of a fenced block; empty for an indented block.
    pub info: String,
    /// `true` for a fenced block, `false` for an indented one.
    pub fenced: bool,
}
70
/// Raw HTML recorded for one `Event::Html` event.
#[derive(Clone, Debug)]
pub struct HtmlBlock {
    /// Source text covered by the event.
    pub text: String,
    /// Byte offset in the source at which the event starts.
    pub byte_offset: usize,
    /// Byte range of the event in the source.
    pub raw_range: Range<usize>,
}
78
/// Raw HTML recorded for one `Event::InlineHtml` event (for example a single
/// opening or closing tag inside a paragraph).
#[derive(Clone, Debug)]
pub struct InlineHtml {
    /// Source text covered by the event.
    pub text: String,
    /// Byte offset in the source at which the event starts.
    pub byte_offset: usize,
    /// Byte range of the event in the source.
    pub raw_range: Range<usize>,
}
87
/// One heading, recorded when its end tag is seen.
#[derive(Clone, Debug)]
pub struct Heading {
    /// Heading text with the markers and surrounding blanks removed.
    pub text: String,
    /// Byte offset in the source at which `text` begins.
    pub byte_offset: usize,
    /// Byte range of the whole heading, from its start event to its end event.
    pub raw_range: Range<usize>,
    /// Heading level as a number (the parser's heading level cast to `u32`).
    pub level: u32,
}
99
/// One list, recorded when its end tag is seen.
#[derive(Clone, Debug)]
pub struct ListGroup {
    /// Byte range from the list's start event to its end event.
    pub raw_range: Range<usize>,
    /// `true` when the parser reported a start number for the list.
    pub ordered: bool,
    /// Items that were opened while this list was the innermost open list.
    pub items: Vec<ListItem>,
}
108
/// One list item inside a [`ListGroup`].
#[derive(Clone, Debug)]
pub struct ListItem {
    /// Byte range reported with the item's start event.
    pub raw_range: Range<usize>,
    /// First marker-like byte found in the item: a digit for ordered lists,
    /// one of `-`, `*`, `+` otherwise; `b'-'` when none is found.
    pub marker_byte: u8,
}
117
/// A leading frontmatter block split off before Markdown parsing.
#[derive(Clone, Debug)]
pub struct Frontmatter {
    /// The whole block from byte 0 of the source, delimiter lines included.
    pub slice: TextSlice,
    /// Which delimiter style opened the block.
    pub delimiter: FrontmatterDelimiter,
}
126
/// Delimiter style of a frontmatter block.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum FrontmatterDelimiter {
    /// Opened by a `---` line; closed by `---` or `...`.
    Yaml,
    /// Opened and closed by a `+++` line.
    Toml,
}
134
/// Borrowed view of a link definition.
///
/// NOTE(review): nothing in this part of the file constructs this type;
/// the field meanings below are inferred from their names — confirm against
/// the producer.
#[derive(Clone, Debug)]
pub struct LinkDef<'a> {
    /// Presumably the definition's label.
    pub label: &'a str,
    /// Presumably the link destination.
    pub dest: &'a str,
    /// Presumably the optional title.
    pub title: Option<&'a str>,
    /// Byte range of the definition in the source (assumed; TODO confirm).
    pub raw_range: Range<usize>,
}
150
/// One `<!-- mdwright: ... -->` directive found in an HTML block.
#[derive(Clone, Debug)]
pub struct Suppression {
    /// Which directive was written.
    pub kind: SuppressionKind,
    /// Rule names listed after the directive keyword; may be empty for
    /// disable/enable directives, never empty for allow directives.
    pub rules: Vec<String>,
    /// Byte range of the HTML block that carried the directive.
    pub raw_range: Range<usize>,
}
180
/// A checkpoint at a top-level block boundary.
#[derive(Copy, Clone, Debug)]
pub struct BlockCheckpointFact {
    /// Byte offset of the checkpoint, saturated to `u32::MAX` for offsets
    /// that do not fit in 32 bits.
    pub byte: u32,
    /// Packed state: container depth in the high 32 bits, number of events
    /// seen so far in the low 32 bits. The initial checkpoint uses 0.
    pub parser_state: u64,
}
187
/// Directive keyword of a [`Suppression`].
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum SuppressionKind {
    /// `allow` (block scope) or `allow-next-line`.
    Allow { scope: AllowScope },
    /// `disable` or `disable-all`.
    Disable,
    /// `enable` or `enable-all`.
    Enable,
}
194
/// Scope of an allow directive.
///
/// How each scope is applied is decided by consumers outside this file.
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub enum AllowScope {
    /// Written as `allow`.
    Block,
    /// Written as `allow-next-line`.
    NextLine,
}
202
/// Flat intermediate representation of one parsed document, built by
/// `Ir::parse`.
#[derive(Debug)]
pub(crate) struct Ir {
    /// Text-event spans outside code blocks, in event order.
    pub(crate) prose_chunks: Vec<TextSlice>,
    /// Result of `collect_autolinks` over the event stream.
    pub(crate) autolinks: Vec<AutolinkFact>,
    /// Inline code spans, in event order.
    pub(crate) inline_codes: Vec<InlineCode>,
    /// Code blocks, in the order their end tags were seen.
    pub(crate) code_blocks: Vec<CodeBlock>,
    /// One entry per HTML block event.
    pub(crate) html_blocks: Vec<HtmlBlock>,
    /// One entry per inline HTML event.
    pub(crate) inline_html: Vec<InlineHtml>,
    /// Headings, in the order their end tags were seen.
    pub(crate) headings: Vec<Heading>,
    /// Lists, in the order their end tags were seen (inner lists first).
    pub(crate) list_groups: Vec<ListGroup>,
    /// Reference table built by `build_reference_table`.
    pub(crate) refs: ReferenceTable,
    /// Suppression directives found in HTML blocks.
    pub(crate) suppressions: Vec<Suppression>,
    /// Leading frontmatter, when one was split off.
    pub(crate) frontmatter: Option<Frontmatter>,
    /// Math regions reported by `scan_math_regions`.
    pub(crate) math_regions: Vec<MathRegion>,
    /// Errors reported by `scan_math_regions`.
    pub(crate) math_errors: Vec<MathError>,
    /// Line index over the canonical source.
    pub(crate) line_index: LineIndex,
    /// Finalized tree; only kept in test builds.
    #[cfg(test)]
    pub(crate) tree: crate::tree::Tree,
    /// Output of `Tree::list_tightness_by_start`; presumably
    /// `(list start offset, is tight)` pairs — TODO confirm.
    pub(crate) list_tightness: Vec<(usize, bool)>,
    /// Output of `Tree::link_like_ranges`.
    pub(crate) link_like_ranges: Vec<Range<usize>>,
    /// Checkpoints at top-level block boundaries.
    pub(crate) block_checkpoints: Vec<BlockCheckpointFact>,
    /// Facts assembled by `FormatFacts::from_parts`.
    pub(crate) format_facts: FormatFacts,
}
228
impl Ir {
    /// Parses `src` into the flat IR.
    ///
    /// The event stream is collected once and walked twice: first by the flat
    /// `Builder`, then by the `TreeBuilder`, which needs the math regions that
    /// are only known after the first walk.
    ///
    /// # Errors
    ///
    /// Propagates the error returned by `parse::collect_events_with_offsets`.
    #[tracing::instrument(level = "info", name = "Ir::parse", skip(src), fields(len = src.canonical().len()))]
    pub(crate) fn parse(src: &Source, opts: ParseOptions) -> Result<Self, ParseError> {
        let canonical_src = CanonicalSource::from_source(src);
        let source = canonical_src.as_str();
        let line_index = LineIndex::new(source);
        // Frontmatter is peeled off first so the Markdown parser only sees
        // the body; `fm_end` is 0 when there is no frontmatter.
        let (fm_end, frontmatter) = split_frontmatter(source);
        let body = canonical_src.trusted_subrange(fm_end..source.len());

        let mut builder = Builder {
            source,
            in_code_block: 0,
            heading_stack: Vec::new(),
            list_stack: Vec::new(),
            code_block_stack: Vec::new(),
            blockquote_stack: Vec::new(),
            blockquote_ranges: Vec::new(),
            list_item_ranges: Vec::new(),
            prose_chunks: Vec::new(),
            inline_codes: Vec::new(),
            code_blocks: Vec::new(),
            html_blocks: Vec::new(),
            inline_html: Vec::new(),
            headings: Vec::new(),
            list_groups: Vec::new(),
        };
        // Event ranges are relative to `body`; shift them by `fm_end` so every
        // range below indexes the full canonical source.
        let events: Vec<(Event<'_>, Range<usize>)> = parse::collect_events_with_offsets(body, parse::options(opts))?
            .into_iter()
            .map(|(e, r)| {
                let abs = r.start.saturating_add(fm_end)..r.end.saturating_add(fm_end);
                (e, abs)
            })
            .collect();
        let block_checkpoints = build_block_checkpoints(source, &events);
        // First walk: fill the flat collections.
        for (event, abs) in &events {
            builder.handle(event.clone(), abs.clone());
        }
        tracing::debug!(events = events.len(), "flat-IR walk complete");

        let transparent_runs = compute_transparent_runs(source, &builder.blockquote_ranges, &builder.list_item_ranges);
        // Code and raw HTML spans are handed to the math scanner as exclusions.
        let math_exclusions: Vec<Range<usize>> = builder
            .inline_codes
            .iter()
            .map(|c| c.raw_range.clone())
            .chain(builder.code_blocks.iter().map(|c| c.raw_range.clone()))
            .chain(builder.html_blocks.iter().map(|h| h.raw_range.clone()))
            .chain(builder.inline_html.iter().map(|h| h.raw_range.clone()))
            .collect();
        let (math_regions, math_errors) = scan_math_regions(
            source,
            &math_exclusions,
            &transparent_runs,
            opts.math().scanner_config(),
        );

        // Second walk: build the tree, now that math regions are known.
        let mut tree_builder = TreeBuilder::new(source, &math_regions);
        for (event, abs) in &events {
            tree_builder.handle(event, abs.clone());
        }
        tracing::debug!(nodes = tree_builder.arena_len(), "tree walk complete");

        let autolinks = collect_autolinks(source, &events, opts.extensions().gfm);
        // `build_reference_table` takes events without their ranges.
        let bare_events: Vec<Event<'_>> = events.iter().map(|(e, _)| e.clone()).collect();
        let refs = build_reference_table(&bare_events, source);
        let suppressions = scan_suppressions(&builder.html_blocks);
        let tree = tree_builder.finalize(&refs);
        let list_tightness = tree.list_tightness_by_start();
        let link_like_ranges = tree.link_like_ranges();
        let format_facts = FormatFacts::from_parts(
            source,
            &events,
            &autolinks,
            &math_regions,
            &builder.code_blocks,
            &builder.html_blocks,
            &tree,
        );

        Ok(Self {
            prose_chunks: builder.prose_chunks,
            autolinks,
            inline_codes: builder.inline_codes,
            code_blocks: builder.code_blocks,
            html_blocks: builder.html_blocks,
            inline_html: builder.inline_html,
            headings: builder.headings,
            list_groups: builder.list_groups,
            refs,
            suppressions,
            frontmatter,
            math_regions,
            math_errors,
            line_index,
            // The tree itself is only retained in test builds; other builds
            // keep just the facts derived from it above.
            #[cfg(test)]
            tree,
            list_tightness,
            link_like_ranges,
            block_checkpoints,
            format_facts,
        })
    }

    /// Returns the line index built over the canonical source.
    pub(crate) fn line_index(&self) -> &LineIndex {
        &self.line_index
    }

    /// Test-only helper: parses `src` with default options.
    ///
    /// # Panics
    ///
    /// Panics when parsing fails, so broken fixtures surface immediately.
    #[cfg(test)]
    #[allow(clippy::expect_used, reason = "test helper rejects invalid fixtures")]
    pub(crate) fn parse_str(src: &str) -> Self {
        let source = crate::source::Source::new(src);
        Self::parse(&source, crate::ParseOptions::default()).expect("test Markdown parses")
    }
}
362
363fn build_block_checkpoints(source: &str, events: &[(Event<'_>, Range<usize>)]) -> Vec<BlockCheckpointFact> {
364 let source_len = u32::try_from(source.len()).unwrap_or(u32::MAX);
365 let cap = (source.len() / 64).saturating_add(2);
366 let mut points = Vec::with_capacity(cap);
367 points.push(BlockCheckpointFact {
368 byte: 0,
369 parser_state: 0,
370 });
371
372 let mut depth: u32 = 0;
373 let mut event_count: u32 = 0;
374 let try_push = |points: &mut Vec<BlockCheckpointFact>, range_start: usize, depth: u32, event_count: u32| {
375 let byte = u32::try_from(range_start).unwrap_or(u32::MAX);
376 if points.last().is_none_or(|last| last.byte < byte) {
377 points.push(BlockCheckpointFact {
378 byte,
379 parser_state: parser_state_hash(depth, event_count),
380 });
381 }
382 };
383 for (event, range) in events {
384 event_count = event_count.saturating_add(1);
385 walk_checkpoint_event(
386 event.clone(),
387 range.start,
388 &mut depth,
389 event_count,
390 &mut points,
391 &try_push,
392 );
393 }
394 if points.last().is_none_or(|last| last.byte < source_len) {
395 points.push(BlockCheckpointFact {
396 byte: source_len,
397 parser_state: parser_state_hash(depth, event_count),
398 });
399 }
400 points
401}
402
403fn walk_checkpoint_event(
404 event: Event<'_>,
405 range_start: usize,
406 depth: &mut u32,
407 event_count: u32,
408 points: &mut Vec<BlockCheckpointFact>,
409 try_push: &impl Fn(&mut Vec<BlockCheckpointFact>, usize, u32, u32),
410) {
411 match event {
412 Event::Start(tag) if *depth == 0 && is_top_level_block(&tag) => {
413 try_push(points, range_start, *depth, event_count);
414 if is_container(&tag) {
415 *depth = depth.saturating_add(1);
416 }
417 }
418 Event::Start(tag) if is_container(&tag) => {
419 *depth = depth.saturating_add(1);
420 }
421 Event::End(end) if is_container_end(end) => {
422 *depth = depth.saturating_sub(1);
423 }
424 Event::Rule if *depth == 0 => {
425 try_push(points, range_start, *depth, event_count);
426 }
427 Event::Start(_)
428 | Event::End(_)
429 | Event::Text(_)
430 | Event::Code(_)
431 | Event::InlineMath(_)
432 | Event::DisplayMath(_)
433 | Event::Html(_)
434 | Event::InlineHtml(_)
435 | Event::FootnoteReference(_)
436 | Event::SoftBreak
437 | Event::HardBreak
438 | Event::Rule
439 | Event::TaskListMarker(_) => {}
440 }
441}
442
443fn is_top_level_block(tag: &Tag<'_>) -> bool {
444 matches!(
445 tag,
446 Tag::Paragraph
447 | Tag::Heading { .. }
448 | Tag::BlockQuote(_)
449 | Tag::CodeBlock(_)
450 | Tag::HtmlBlock
451 | Tag::List(_)
452 | Tag::Table(_)
453 | Tag::FootnoteDefinition(_)
454 )
455}
456
457fn is_container(tag: &Tag<'_>) -> bool {
458 matches!(
459 tag,
460 Tag::BlockQuote(_)
461 | Tag::List(_)
462 | Tag::Item
463 | Tag::FootnoteDefinition(_)
464 | Tag::Table(_)
465 | Tag::TableHead
466 | Tag::TableRow
467 | Tag::TableCell
468 )
469}
470
471fn is_container_end(end: TagEnd) -> bool {
472 matches!(
473 end,
474 TagEnd::BlockQuote(_)
475 | TagEnd::List(_)
476 | TagEnd::Item
477 | TagEnd::FootnoteDefinition
478 | TagEnd::Table
479 | TagEnd::TableHead
480 | TagEnd::TableRow
481 | TagEnd::TableCell
482 )
483}
484
/// Packs the walk state into one `u64`: `depth` in the high 32 bits and
/// `event_count` in the low 32 bits.
fn parser_state_hash(depth: u32, event_count: u32) -> u64 {
    let high = u64::from(depth) << 32;
    let low = u64::from(event_count);
    high | low
}
488
/// Accumulator for the first (flat) walk over the event stream.
///
/// The `*_stack` fields track constructs that are currently open; the
/// remaining vectors collect finished facts.
struct Builder<'a> {
    /// Source text that all event ranges index into.
    source: &'a str,
    /// Number of currently open code blocks; text events are not recorded as
    /// prose while this is non-zero.
    in_code_block: u32,
    /// Open headings as `(start offset, level)`.
    heading_stack: Vec<(usize, u32)>,
    /// Open lists, innermost last.
    list_stack: Vec<OpenList>,
    /// Open code blocks as `(start offset, info string, fenced)`.
    code_block_stack: Vec<(usize, String, bool)>,
    /// Start offsets of open blockquotes.
    blockquote_stack: Vec<usize>,
    /// Byte ranges of closed blockquotes.
    blockquote_ranges: Vec<Range<usize>>,
    /// Each list item's range paired with its continuation width as computed
    /// by `item_continuation_width`.
    list_item_ranges: Vec<(Range<usize>, u8)>,
    /// Collected prose chunks.
    prose_chunks: Vec<TextSlice>,
    /// Collected inline code spans.
    inline_codes: Vec<InlineCode>,
    /// Collected code blocks.
    code_blocks: Vec<CodeBlock>,
    /// Collected HTML block events.
    html_blocks: Vec<HtmlBlock>,
    /// Collected inline HTML events.
    inline_html: Vec<InlineHtml>,
    /// Collected headings.
    headings: Vec<Heading>,
    /// Collected lists.
    list_groups: Vec<ListGroup>,
}
521
/// A list whose end tag has not been seen yet.
struct OpenList {
    /// Start offset of the list's start event.
    start: usize,
    /// `true` when the parser reported a start number.
    ordered: bool,
    /// Items opened so far while this list was innermost.
    items: Vec<ListItem>,
}
527
528impl Builder<'_> {
529 #[allow(clippy::wildcard_enum_match_arm)] fn handle(&mut self, event: Event<'_>, range: Range<usize>) {
531 match event {
532 Event::Start(tag) => self.start(tag, range),
533 Event::End(tag) => self.end(tag, range),
534 Event::Text(_) => self.push_prose(range),
535 Event::Code(_) => self.push_inline_code(range),
536 Event::Html(_) => self.push_html_block(range),
537 Event::InlineHtml(_) => self.push_inline_html(range),
538 _ => {}
543 }
544 }
545
546 #[allow(clippy::wildcard_enum_match_arm)] fn start(&mut self, tag: Tag<'_>, range: Range<usize>) {
548 match tag {
549 Tag::Heading { level, .. } => {
550 self.heading_stack.push((range.start, level as u32));
551 }
552 Tag::CodeBlock(kind) => {
553 self.in_code_block = self.in_code_block.saturating_add(1);
554 let (info, fenced) = match kind {
555 CodeBlockKind::Fenced(s) => (s.into_string(), true),
556 CodeBlockKind::Indented => (String::new(), false),
557 };
558 self.code_block_stack.push((range.start, info, fenced));
559 }
560 Tag::List(start) => {
561 self.list_stack.push(OpenList {
562 start: range.start,
563 ordered: start.is_some(),
564 items: Vec::new(),
565 });
566 }
567 Tag::Item => {
568 let ordered = self.list_stack.last().is_some_and(|l| l.ordered);
573 let marker_byte = derive_item_marker_byte(self.source, range.clone(), ordered).unwrap_or(b'-');
574 let indent = item_continuation_width(self.source, &range);
575 self.list_item_ranges.push((range.clone(), indent));
576 if let Some(open) = self.list_stack.last_mut() {
577 open.items.push(ListItem {
578 raw_range: range,
579 marker_byte,
580 });
581 }
582 }
583 Tag::BlockQuote(_) => {
584 self.blockquote_stack.push(range.start);
585 }
586 #[allow(clippy::wildcard_enum_match_arm)]
587 _ => {}
588 }
589 }
590
591 #[allow(clippy::wildcard_enum_match_arm)] fn end(&mut self, tag: TagEnd, range: Range<usize>) {
593 match tag {
594 TagEnd::Heading(_) => {
595 if let Some((start, level)) = self.heading_stack.pop() {
596 let end = range.end;
597 let raw = self.source.get(start..end).unwrap_or("");
598 let (trimmed, off) = trim_heading(raw);
599 self.headings.push(Heading {
600 text: trimmed.to_owned(),
601 byte_offset: start.saturating_add(off),
602 raw_range: start..end,
603 level,
604 });
605 }
606 }
607 TagEnd::CodeBlock => {
608 self.in_code_block = self.in_code_block.saturating_sub(1);
609 if let Some((start, info, fenced)) = self.code_block_stack.pop() {
610 let end = range.end;
611 let raw = self.source.get(start..end).unwrap_or("");
612 self.code_blocks.push(CodeBlock {
613 text: raw.to_owned(),
614 byte_offset: start,
615 raw_range: start..end,
616 info,
617 fenced,
618 });
619 }
620 }
621 TagEnd::List(_) => {
622 if let Some(open) = self.list_stack.pop() {
623 self.list_groups.push(ListGroup {
624 raw_range: open.start..range.end,
625 ordered: open.ordered,
626 items: open.items,
627 });
628 }
629 }
630 TagEnd::BlockQuote(_) => {
631 if let Some(start) = self.blockquote_stack.pop() {
632 self.blockquote_ranges.push(start..range.end);
633 }
634 }
635 #[allow(clippy::wildcard_enum_match_arm)]
636 _ => {}
637 }
638 }
639
640 fn push_prose(&mut self, range: Range<usize>) {
641 if self.in_code_block > 0 {
642 return;
643 }
644 let bytes = self.source.as_bytes();
648 let start = if range.start > 0 && bytes.get(range.start.saturating_sub(1)) == Some(&b'\\') {
649 range.start.saturating_sub(1)
650 } else {
651 range.start
652 };
653 let end = range.end;
654 let Some(text) = self.source.get(start..end) else {
655 return;
656 };
657 self.prose_chunks.push(TextSlice {
658 text: text.to_owned(),
659 byte_offset: start,
660 raw_range: start..end,
661 });
662 }
663
664 fn push_inline_code(&mut self, range: Range<usize>) {
665 let raw = self.source.get(range.clone()).unwrap_or("");
666 let lead = raw.bytes().take_while(|&b| b == b'`').count();
667 let trail = raw.bytes().rev().take_while(|&b| b == b'`').count();
668 let (content_start, content_end) = if lead == 0 || trail == 0 || lead.saturating_add(trail) >= raw.len() {
669 (range.start, range.end)
670 } else {
671 (range.start.saturating_add(lead), range.end.saturating_sub(trail))
672 };
673 let Some(text) = self.source.get(content_start..content_end) else {
674 return;
675 };
676 self.inline_codes.push(InlineCode {
677 text: text.to_owned(),
678 byte_offset: content_start,
679 raw_range: range,
680 });
681 }
682
683 fn push_html_block(&mut self, range: Range<usize>) {
684 let Some(text) = self.source.get(range.clone()) else {
685 return;
686 };
687 self.html_blocks.push(HtmlBlock {
688 text: text.to_owned(),
689 byte_offset: range.start,
690 raw_range: range,
691 });
692 }
693
694 fn push_inline_html(&mut self, range: Range<usize>) {
695 let Some(text) = self.source.get(range.clone()) else {
696 return;
697 };
698 self.inline_html.push(InlineHtml {
699 text: text.to_owned(),
700 byte_offset: range.start,
701 raw_range: range,
702 });
703 }
704}
705
/// Finds the first marker-like byte inside `range` of `source`.
///
/// For an ordered list that is the first ASCII digit; otherwise the first of
/// `-`, `*`, `+`. Returns `None` when `range` is out of bounds or no such
/// byte exists.
fn derive_item_marker_byte(source: &str, range: core::ops::Range<usize>, ordered: bool) -> Option<u8> {
    let item = source.as_bytes().get(range)?;
    // Pick the predicate once instead of branching per byte.
    let is_marker: fn(&u8) -> bool = if ordered {
        |byte: &u8| byte.is_ascii_digit()
    } else {
        |byte: &u8| matches!(*byte, b'-' | b'*' | b'+')
    };
    item.iter().find(|byte| is_marker(byte)).copied()
}
725
/// Measures how many bytes the list marker occupies on the item's first
/// non-blank line.
///
/// The width counts leading spaces/tabs, the marker (`-`, `*`, `+`, or digits
/// followed by `.` or `)`), and at most one following space. Returns 0 when
/// the range is out of bounds, the item has no non-blank line, or that line
/// does not start with a marker. Widths above 255 saturate to `u8::MAX`.
fn item_continuation_width(source: &str, raw_range: &Range<usize>) -> u8 {
    /// Length of the run of accepted bytes starting at `from`.
    fn run_len(line: &[u8], from: usize, accept: impl Fn(u8) -> bool) -> usize {
        line.get(from..)
            .map_or(0, |rest| rest.iter().take_while(|&&b| accept(b)).count())
    }

    let item = source.as_bytes().get(raw_range.clone()).unwrap_or(&[]);
    let first_content_line = item
        .split(|&b| b == b'\n')
        .find(|line| line.iter().any(|b| !matches!(*b, b' ' | b'\t' | b'\r')));
    let Some(line) = first_content_line else {
        return 0;
    };

    let indent = run_len(line, 0, |b| matches!(b, b' ' | b'\t'));
    let digits = run_len(line, indent, |b| b.is_ascii_digit());
    let marker_end = if digits > 0 {
        // Ordered marker: the digits must be closed by `.` or `)`.
        let delimiter_at = indent.saturating_add(digits);
        if !matches!(line.get(delimiter_at), Some(b'.' | b')')) {
            return 0;
        }
        delimiter_at.saturating_add(1)
    } else if matches!(line.get(indent), Some(b'-' | b'*' | b'+')) {
        indent.saturating_add(1)
    } else {
        return 0;
    };

    let width = if line.get(marker_end) == Some(&b' ') {
        marker_end.saturating_add(1)
    } else {
        marker_end
    };
    u8::try_from(width).unwrap_or(u8::MAX)
}
775
/// Computes, per source line, the leading run of container syntax that sits
/// in front of the line's content.
///
/// For every line the cursor repeatedly skips either
/// * a blockquote marker — up to three spaces, `>`, and one optional space —
///   when the cursor lies inside one of `blockquote_ranges`, or
/// * up to `width` spaces of list-item indentation, where `width` belongs to
///   the latest-starting item in `list_item_ranges` that began before this
///   line and still covers the cursor.
///
/// Each line where the cursor moved contributes `line_start..cursor`.
/// Returns an empty vector when both range lists are empty.
fn compute_transparent_runs(
    source: &str,
    blockquote_ranges: &[Range<usize>],
    list_item_ranges: &[(Range<usize>, u8)],
) -> Vec<Range<usize>> {
    if blockquote_ranges.is_empty() && list_item_ranges.is_empty() {
        return Vec::new();
    }

    let bytes = source.as_bytes();
    let byte_is = |at: usize, wanted: u8| bytes.get(at).copied() == Some(wanted);
    let inside_quote = |at: usize| blockquote_ranges.iter().any(|quote| quote.contains(&at));
    let enclosing_item_width = |line_start: usize, at: usize| {
        list_item_ranges
            .iter()
            .filter(|(item, _)| item.start < line_start && at < item.end)
            .max_by_key(|(item, _)| item.start)
            .map(|(_, width)| usize::from(*width))
    };

    let mut runs: Vec<Range<usize>> = Vec::new();
    let mut line_start = 0usize;
    for line in source.split('\n') {
        let line_end = line_start.saturating_add(line.len());
        let mut cursor = line_start;
        loop {
            // Blockquote marker, optionally indented by up to three spaces.
            let indent = (0..3usize)
                .take_while(|&k| byte_is(cursor.saturating_add(k), b' '))
                .count();
            let marker = cursor.saturating_add(indent);
            if marker < line_end && byte_is(marker, b'>') && inside_quote(cursor) {
                cursor = marker.saturating_add(1);
                if cursor < line_end && byte_is(cursor, b' ') {
                    cursor = cursor.saturating_add(1);
                }
                continue;
            }

            // Otherwise, list-item continuation indentation.
            let padding = enclosing_item_width(line_start, cursor).map_or(0, |width| {
                (0..width)
                    .take_while(|&k| {
                        let at = cursor.saturating_add(k);
                        at < line_end && byte_is(at, b' ')
                    })
                    .count()
            });
            if padding == 0 {
                break;
            }
            cursor = cursor.saturating_add(padding);
        }
        if cursor > line_start {
            runs.push(line_start..cursor);
        }
        line_start = line_end.saturating_add(1);
    }
    runs
}
854
/// Extracts the visible text of a heading from its raw source span.
///
/// Only the first line of `raw` is considered, so the underline of a
/// two-line heading is ignored. Leading `#`s and the blanks after them are
/// skipped, trailing blanks are dropped, and an optional closing run of `#`s
/// is removed.
///
/// A trailing run of `#`s only counts as a closing sequence when it is
/// preceded by a space or tab (or makes up the whole content), as CommonMark
/// requires. This keeps headings such as `# C#` or `# foo#` intact instead of
/// silently eating the `#`.
///
/// Returns the text and its byte offset within `raw`.
fn trim_heading(raw: &str) -> (&str, usize) {
    let blank = |c: char| c == ' ' || c == '\t';

    let body = raw.strip_suffix('\n').unwrap_or(raw);
    let body = body.split_once('\n').map_or(body, |(first, _)| first);

    let lead_hashes = body.bytes().take_while(|&b| b == b'#').count();
    let after_hashes = body.get(lead_hashes..).unwrap_or("");
    let lead_ws = after_hashes.bytes().take_while(|&b| b == b' ' || b == b'\t').count();
    let inner_start = lead_hashes.saturating_add(lead_ws);

    let inner = body.get(inner_start..).unwrap_or("").trim_end_matches(blank);
    let without_hashes = inner.trim_end_matches('#');
    // When the content is nothing but `#`s, the blanks consumed above are the
    // whitespace that precedes the closing sequence.
    let is_closing_sequence = without_hashes.is_empty() || without_hashes.ends_with(blank);
    let text = if is_closing_sequence {
        without_hashes.trim_end_matches(blank)
    } else {
        inner
    };
    (text, inner_start)
}
883
884fn split_frontmatter(source: &str) -> (usize, Option<Frontmatter>) {
893 let first_line_end = source.find('\n');
894 let first_line = first_line_end.map_or(source, |n| source.get(..n).unwrap_or(""));
895 let trimmed_first = first_line.trim_end();
896 let delimiter = match trimmed_first {
897 "---" => FrontmatterDelimiter::Yaml,
898 "+++" => FrontmatterDelimiter::Toml,
899 _ => return (0, None),
900 };
901 let body_start = first_line_end.map_or(source.len(), |n| n.saturating_add(1));
902 let Some(rest) = source.get(body_start..) else {
903 return (0, None);
904 };
905 let mut cursor = 0usize;
906 while cursor < rest.len() {
907 let nl = rest
908 .get(cursor..)
909 .and_then(|s| s.find('\n'))
910 .unwrap_or_else(|| rest.len().saturating_sub(cursor));
911 let end_excl = cursor.saturating_add(nl);
912 let line = rest.get(cursor..end_excl).unwrap_or("");
913 let trimmed = line.trim_end();
914 let is_close = match delimiter {
915 FrontmatterDelimiter::Yaml => trimmed == "---" || trimmed == "...",
916 FrontmatterDelimiter::Toml => trimmed == "+++",
917 };
918 if is_close {
919 let body_text = rest.get(..end_excl).unwrap_or("");
928 if !frontmatter_body_has_key(body_text, delimiter) {
929 return (0, None);
930 }
931 let total = body_start.saturating_add(end_excl).saturating_add(1).min(source.len());
932 let text = source.get(0..total).unwrap_or("");
933 return (
934 total,
935 Some(Frontmatter {
936 slice: TextSlice {
937 text: text.to_owned(),
938 byte_offset: 0,
939 raw_range: 0..total,
940 },
941 delimiter,
942 }),
943 );
944 }
945 cursor = end_excl.saturating_add(1);
946 }
947 let _ = delimiter;
955 (0, None)
956}
957
958fn frontmatter_body_has_key(body: &str, delimiter: FrontmatterDelimiter) -> bool {
963 let key_byte = match delimiter {
964 FrontmatterDelimiter::Yaml => b':',
965 FrontmatterDelimiter::Toml => b'=',
966 };
967 body.lines().any(|line| line_has_key(line, key_byte))
968}
969
/// Reports whether `line` starts with an identifier followed by `key_byte`.
///
/// Accepted shape: optional spaces/tabs, a first byte in `[A-Za-z_]`, any
/// number of `[A-Za-z0-9_.-]`, optional spaces/tabs, then `key_byte`.
fn line_has_key(line: &str, key_byte: u8) -> bool {
    /// Drops the longest accepted prefix of `bytes`.
    fn skip_while(bytes: &[u8], accept: impl Fn(u8) -> bool) -> &[u8] {
        let skipped = bytes.iter().take_while(|&&b| accept(b)).count();
        bytes.get(skipped..).unwrap_or(&[])
    }

    let blank = |b: u8| matches!(b, b' ' | b'\t');

    let after_indent = skip_while(line.as_bytes(), blank);
    let Some((&first, tail)) = after_indent.split_first() else {
        return false;
    };
    if !(first.is_ascii_alphabetic() || first == b'_') {
        return false;
    }
    let after_name = skip_while(tail, |b| {
        b.is_ascii_alphanumeric() || matches!(b, b'_' | b'-' | b'.')
    });
    skip_while(after_name, blank).first() == Some(&key_byte)
}
1000
1001fn suppression_regex() -> &'static Regex {
1002 static RE: OnceLock<Regex> = OnceLock::new();
1003 RE.get_or_init(|| {
1010 compile_static(
1011 r"^ {0,3}<!--\s*mdwright:\s*(?P<kind>allow-next-line|allow|disable-all|enable-all|disable|enable)(?:[ \t]+(?P<names>[\w\-,\s]+?))?\s*-->\s*$",
1012 )
1013 })
1014}
1015
1016fn scan_suppressions(html_blocks: &[HtmlBlock]) -> Vec<Suppression> {
1021 let mut out = Vec::new();
1022 let re = suppression_regex();
1023 for block in html_blocks {
1024 let trimmed = block.text.trim_end();
1025 let Some(caps) = re.captures(trimmed) else {
1026 continue;
1027 };
1028 let Some(kind_match) = caps.name("kind") else {
1029 continue;
1030 };
1031 let kind = match kind_match.as_str() {
1032 "allow" => SuppressionKind::Allow {
1033 scope: AllowScope::Block,
1034 },
1035 "allow-next-line" => SuppressionKind::Allow {
1036 scope: AllowScope::NextLine,
1037 },
1038 "disable" | "disable-all" => SuppressionKind::Disable,
1039 "enable" | "enable-all" => SuppressionKind::Enable,
1040 _ => continue,
1041 };
1042 let rules: Vec<String> = caps
1043 .name("names")
1044 .map_or("", |m| m.as_str())
1045 .split([',', ' ', '\t'])
1046 .filter(|s| !s.is_empty())
1047 .map(str::to_owned)
1048 .collect();
1049 if matches!(kind, SuppressionKind::Allow { .. }) && rules.is_empty() {
1053 continue;
1054 }
1055 out.push(Suppression {
1056 kind,
1057 rules,
1058 raw_range: block.raw_range.clone(),
1059 });
1060 }
1061 out
1062}
1063
/// Unit tests for the flat IR: prose/code extraction, frontmatter splitting,
/// headings, lists, reference definitions, suppression directives and
/// transparent-run computation.
#[cfg(test)]
#[allow(
    clippy::indexing_slicing,
    reason = "test asserts; panic surface is the test framework"
)]
mod tests {
    use super::Ir;

    /// Turns a missing value into an `Err` carrying `label`, so tests can use
    /// `?` instead of unwrapping.
    fn some_ref<'a, T>(value: Option<&'a T>, label: &str) -> Result<&'a T, String> {
        match value {
            Some(value) => Ok(value),
            None => Err(label.to_owned()),
        }
    }

    #[test]
    fn prose_chunks_include_backslash_escapes() {
        let ir = Ir::parse_str(r"a \_b\_ c");
        let texts: Vec<&str> = ir.prose_chunks.iter().map(|c| c.text.as_str()).collect();
        assert!(
            texts.iter().any(|t| t.contains(r"\_")),
            "prose chunks should preserve `\\_`: {texts:?}"
        );
    }

    #[test]
    fn fenced_code_excluded_from_prose() {
        let src = "before\n```\nx \\_y\\_ z\n```\nafter \\_outside\\_\n";
        let ir = Ir::parse_str(src);
        // Nothing from inside the fence may show up as prose.
        for c in &ir.prose_chunks {
            assert!(!c.text.contains("\\_y"), "prose chunk leaked code body: {:?}", c.text);
        }
        let texts: Vec<&str> = ir.prose_chunks.iter().map(|c| c.text.as_str()).collect();
        // The escape after the fence is ordinary prose and must survive.
        assert!(texts.iter().any(|t| t.contains("\\_")), "no chunk has `\\_`: {texts:?}");
        assert!(
            texts.iter().any(|t| t.contains("outside")),
            "no chunk has `outside`: {texts:?}"
        );
        assert_eq!(ir.code_blocks.len(), 1);
    }

    #[test]
    fn inline_code_strips_fences() -> Result<(), String> {
        let ir = Ir::parse_str("see `foo_bar` here\n");
        assert_eq!(ir.inline_codes.len(), 1);
        let code = some_ref(ir.inline_codes.first(), "missing")?;
        assert_eq!(code.text, "foo_bar");
        Ok(())
    }

    #[test]
    fn frontmatter_split() -> Result<(), String> {
        let src = "---\ntitle: T\n---\nbody text\n";
        let ir = Ir::parse_str(src);
        let fm = some_ref(ir.frontmatter.as_ref(), "frontmatter")?;
        assert_eq!(fm.delimiter, super::FrontmatterDelimiter::Yaml);
        let body_chunks: Vec<&str> = ir.prose_chunks.iter().map(|c| c.text.as_str()).collect();
        assert!(body_chunks.iter().any(|t| t == &"body text"));
        Ok(())
    }

    #[test]
    fn frontmatter_opener_without_close_is_thematic_break() -> Result<(), String> {
        // A lone `---` with no closing line must stay ordinary Markdown.
        let src = "---\n\n- a\n- a\n\n- a\n";
        let ir = Ir::parse_str(src);
        assert!(ir.frontmatter.is_none(), "no frontmatter without close");
        let any_a = ir.prose_chunks.iter().any(|c| c.text == "a");
        assert!(
            any_a,
            "body markdown should be parsed as prose, got {:?}",
            ir.prose_chunks
        );
        Ok(())
    }

    #[test]
    fn frontmatter_toml_split() -> Result<(), String> {
        let src = "+++\ntitle = \"T\"\n+++\nbody text\n";
        let ir = Ir::parse_str(src);
        let fm = some_ref(ir.frontmatter.as_ref(), "frontmatter")?;
        assert_eq!(fm.delimiter, super::FrontmatterDelimiter::Toml);
        let body_chunks: Vec<&str> = ir.prose_chunks.iter().map(|c| c.text.as_str()).collect();
        assert!(body_chunks.iter().any(|t| t == &"body text"));
        Ok(())
    }

    #[test]
    fn headings_trimmed_and_levelled() {
        let ir = Ir::parse_str("# One\n\n## Two ##\n\n### Three\n");
        assert_eq!(ir.headings.len(), 3);
        let texts: Vec<(&str, u32)> = ir.headings.iter().map(|h| (h.text.as_str(), h.level)).collect();
        assert_eq!(texts, vec![("One", 1), ("Two", 2), ("Three", 3)]);
    }

    #[test]
    fn list_groups_record_markers() -> Result<(), String> {
        // Changing the bullet character starts a second list.
        let src = "- one\n- two\n* three\n";
        let ir = Ir::parse_str(src);
        assert_eq!(ir.list_groups.len(), 2);
        let g1 = some_ref(ir.list_groups.first(), "first list")?;
        assert!(!g1.ordered);
        let markers: Vec<u8> = g1.items.iter().map(|i| i.marker_byte).collect();
        assert_eq!(markers, vec![b'-', b'-']);
        let g2 = some_ref(ir.list_groups.get(1), "second list")?;
        let item = some_ref(g2.items.first(), "item")?;
        assert_eq!(item.marker_byte, b'*');
        Ok(())
    }

    #[test]
    fn link_defs_scanned() -> Result<(), String> {
        let src = "[bar]: https://example.com\n\nSee [ref][bar].\n";
        let ir = Ir::parse_str(src);
        let target = some_ref(ir.refs.iter().next(), "expected one target")?;
        assert_eq!(target.label_raw, "bar");
        assert_eq!(target.dest, "https://example.com");
        Ok(())
    }

    #[test]
    fn link_defs_skipped_inside_code_block() {
        let src = "```\n[bar]: https://example.com\n```\n";
        let ir = Ir::parse_str(src);
        assert!(ir.refs.is_empty());
    }

    #[test]
    fn inline_html_collected() {
        let src = "before <span>x</span> after\n";
        let ir = Ir::parse_str(src);
        assert!(ir.inline_html.iter().any(|h| h.text == "<span>"));
        assert!(ir.inline_html.iter().any(|h| h.text == "</span>"));
    }

    #[test]
    fn code_block_info_string() -> Result<(), String> {
        let src = "```rust\nfn x() {}\n```\n";
        let ir = Ir::parse_str(src);
        assert_eq!(ir.code_blocks.len(), 1);
        let cb = some_ref(ir.code_blocks.first(), "cb")?;
        assert_eq!(cb.info, "rust");
        assert!(cb.fenced);
        Ok(())
    }

    use super::{AllowScope, SuppressionKind};

    #[test]
    fn suppression_allow_parses() -> Result<(), String> {
        let src = "<!-- mdwright: allow heading-punctuation -->\n# Title.\n";
        let ir = Ir::parse_str(src);
        assert_eq!(ir.suppressions.len(), 1);
        let s = some_ref(ir.suppressions.first(), "first")?;
        assert_eq!(
            s.kind,
            SuppressionKind::Allow {
                scope: AllowScope::Block
            }
        );
        assert_eq!(s.rules, vec!["heading-punctuation"]);
        Ok(())
    }

    #[test]
    fn suppression_allow_next_line_parses() -> Result<(), String> {
        let src = "<!-- mdwright: allow-next-line trailing-whitespace -->\nfoo \n";
        let ir = Ir::parse_str(src);
        let s = some_ref(ir.suppressions.first(), "first")?;
        assert_eq!(
            s.kind,
            SuppressionKind::Allow {
                scope: AllowScope::NextLine
            }
        );
        Ok(())
    }

    #[test]
    fn suppression_multiple_rules_parses() -> Result<(), String> {
        let src = "<!-- mdwright: allow rule-a, rule-b, rule-c -->\nbody\n";
        let ir = Ir::parse_str(src);
        let s = some_ref(ir.suppressions.first(), "first")?;
        assert_eq!(s.rules, vec!["rule-a", "rule-b", "rule-c"]);
        Ok(())
    }

    #[test]
    fn suppression_disable_enable_parse() -> Result<(), String> {
        let src = "<!-- mdwright: disable bare-url -->\n\nfoo\n\n<!-- mdwright: enable bare-url -->\n";
        let ir = Ir::parse_str(src);
        assert_eq!(ir.suppressions.len(), 2);
        let first = some_ref(ir.suppressions.first(), "first")?;
        let second = some_ref(ir.suppressions.get(1), "second")?;
        assert_eq!(first.kind, SuppressionKind::Disable);
        assert_eq!(second.kind, SuppressionKind::Enable);
        Ok(())
    }

    #[test]
    fn suppression_disable_all_alias_parses() -> Result<(), String> {
        let src = "<!-- mdwright: disable-all -->\nfoo\n";
        let ir = Ir::parse_str(src);
        let s = some_ref(ir.suppressions.first(), "first")?;
        assert_eq!(s.kind, SuppressionKind::Disable);
        assert!(s.rules.is_empty());
        Ok(())
    }

    #[test]
    fn suppression_bare_allow_rejected() {
        // `allow` without rule names must not produce a suppression.
        let src = "<!-- mdwright: allow -->\n# Title\n";
        let ir = Ir::parse_str(src);
        assert!(ir.suppressions.is_empty());
    }

    #[test]
    fn suppression_inline_html_ignored() {
        // Only HTML blocks are scanned; a comment inside a paragraph is not.
        let src = "Some text <!-- mdwright: allow bare-url --> more text.\n";
        let ir = Ir::parse_str(src);
        assert!(ir.suppressions.is_empty());
    }

    #[test]
    fn suppression_with_indent_parses() -> Result<(), String> {
        let src = " <!-- mdwright: allow heading-punctuation -->\n# Title.\n";
        let ir = Ir::parse_str(src);
        let s = some_ref(ir.suppressions.first(), "first")?;
        assert_eq!(s.rules, vec!["heading-punctuation"]);
        Ok(())
    }

    use super::compute_transparent_runs;

    #[test]
    fn transparent_runs_for_blockquote_continuation() {
        // Each line contributes its `> ` prefix.
        let src = "> a\n> b\n";
        let bq = 0..src.len();
        let runs = compute_transparent_runs(src, std::slice::from_ref(&bq), &[]);
        assert_eq!(runs, vec![0..2, 4..6]);
    }

    #[test]
    fn transparent_runs_for_nested_blockquote() {
        // Both `> ` markers on a line are part of the same run.
        let src = "> > a\n> > b\n";
        let outer = 0..src.len();
        let inner = 2..src.len();
        let runs = compute_transparent_runs(src, &[outer, inner], &[]);
        assert_eq!(runs, vec![0..4, 6..10]);
    }

    #[test]
    fn transparent_runs_for_list_item_continuation() {
        // The continuation line is indented by the full marker width (three
        // spaces for `1. `); the marker line itself contributes nothing.
        let src = "1. a\n   b\n";
        let item = (0..src.len(), 3);
        let runs = compute_transparent_runs(src, &[], &[item]);
        assert_eq!(runs, vec![5..8]);
    }

    #[test]
    fn transparent_runs_empty_for_plain_paragraph() {
        let src = "hello\nworld\n";
        let runs = compute_transparent_runs(src, &[], &[]);
        assert!(runs.is_empty(), "expected empty: {runs:?}");
    }
}