marki_parse/lib.rs
1//! A fast, zero-copy `CommonMark` parser with SIMD-accelerated scanning.
2//!
3//! `marki-parse` parses Markdown into structured [`Section`] and [`Inline`] elements,
4//! borrowing directly from the input string with no intermediate allocations for
5//! text content.
6//!
7//! # Quick start
8//!
9//! ```
10//! use marki_parse::MarkdownFile;
11//!
12//! let md: MarkdownFile<'_> = MarkdownFile::parse("# Hello\n\nSome **bold** text.");
13//! for section in &md.sections {
14//! println!("{section:?}");
15//! }
16//! ```
17//!
18//! # CRLF input
19//!
20//! The parser operates on LF (`\n`) line endings. For input that may contain
21//! `\r\n`, call [`normalize`] first — it returns the input borrowed when no
22//! `\r` is present (zero cost):
23//!
24//! ```
25//! let input = "# Hello\r\nWorld";
26//! let normalized = marki_parse::normalize(input);
27//! let md: marki_parse::MarkdownFile<'_> = marki_parse::MarkdownFile::parse(&normalized);
28//! ```
29//!
30//! # Accessing inline elements
31//!
32//! Inline elements are stored in a flat pool for cache efficiency. Use
33//! [`MarkdownFile::inlines`] and [`MarkdownFile::item_spans`] (or index with
34//! [`InlineSpan`] / [`SpanSlice`]) to retrieve them:
35//!
36//! ```
37//! use marki_parse::{MarkdownFile, Section};
38//!
39//! let md: MarkdownFile<'_> = MarkdownFile::parse("Hello **world**");
40//! if let Some(Section::Paragraph { content }) = md.sections.first() {
41//! for inline in md.inlines(*content) {
42//! println!("{inline:?}");
43//! }
44//! }
45//! ```
46
47mod block;
48mod inline;
49mod section;
50pub(crate) mod simd;
51mod special_char;
52
53#[cfg(test)]
54mod fuzz_finds;
55#[cfg(test)]
56mod tests;
57
58use std::borrow::Cow;
59
60pub use inline::Inline;
61pub use section::{InlineSpan, OrderedListDelimiter, Section, SpanSlice};
62pub use special_char::SpecialChar;
63
64/// Normalize line endings for parsing. Converts `\r\n` to `\n` and bare `\r`
65/// (classic Mac) to `\n`. Returns the input borrowed if no carriage returns
66/// are found.
67///
68/// Use this before [`MarkdownFile::parse`] when the input may contain CRLF
69/// line endings:
70///
71/// ```
72/// let input = "# Hello\r\nWorld";
73/// let normalized = marki_parse::normalize(input);
74/// let md: marki_parse::MarkdownFile<'_> = marki_parse::MarkdownFile::parse(&normalized);
75/// ```
76#[must_use]
77pub fn normalize(input: &str) -> Cow<'_, str> {
78 let bytes = input.as_bytes();
79 if simd::find_byte(bytes, 0, SpecialChar::CarriageReturn.byte()).is_none() {
80 return Cow::Borrowed(input);
81 }
82 // Single-pass: copy chunks between \r characters, replacing each \r with
83 // \n and consuming the following \n in \r\n pairs.
84 let mut out = String::with_capacity(input.len());
85 let mut start = 0;
86 while let Some(cr) = simd::find_byte(bytes, start, SpecialChar::CarriageReturn.byte()) {
87 out.push_str(&input[start..cr]);
88 out.push('\n');
89 start = cr + 1;
90 // Consume the \n in a \r\n pair so it doesn't become a double newline.
91 if bytes.get(start) == Some(&SpecialChar::Newline.byte()) {
92 start += 1;
93 }
94 }
95 out.push_str(&input[start..]);
96 Cow::Owned(out)
97}
98
99/// A parsed Markdown document.
100///
101/// Contains the block-level [`Section`]s and the internal pools that store
102/// [`Inline`] elements. Use [`inlines`](Self::inlines) and
103/// [`item_spans`](Self::item_spans) to access inline content referenced by
104/// sections.
105///
106/// The const generics `MAX_INLINE_DEPTH` and `INLINE_STACK_CAP` control
107/// recursion depth and stack-allocation size for emphasis parsing. The
108/// defaults (`16` and `32`) are suitable for virtually all real-world input.
109#[derive(Debug, Clone, PartialEq, Eq)]
110pub struct MarkdownFile<'src, const MAX_INLINE_DEPTH: u8 = 16, const INLINE_STACK_CAP: usize = 32> {
111 /// The block-level sections of the document, in order.
112 pub sections: Vec<Section<'src>>,
113 pool: Vec<Inline<'src>>,
114 span_pool: Vec<InlineSpan>,
115}
116
117impl<'src, const MAX_INLINE_DEPTH: u8, const INLINE_STACK_CAP: usize>
118 MarkdownFile<'src, MAX_INLINE_DEPTH, INLINE_STACK_CAP>
119{
120 /// Get the inline elements referenced by a span.
121 #[must_use]
122 pub fn inlines(&self, span: InlineSpan) -> &[Inline<'src>] {
123 &self[span]
124 }
125
126 /// Get the item spans referenced by a `SpanSlice` (list items).
127 #[must_use]
128 pub fn item_spans(&self, slice: SpanSlice) -> &[InlineSpan] {
129 &self[slice]
130 }
131}
132
133impl<'src, const MAX_INLINE_DEPTH: u8, const INLINE_STACK_CAP: usize> std::ops::Index<InlineSpan>
134 for MarkdownFile<'src, MAX_INLINE_DEPTH, INLINE_STACK_CAP>
135{
136 type Output = [Inline<'src>];
137
138 fn index(&self, span: InlineSpan) -> &[Inline<'src>] {
139 let start = span.start as usize;
140 let end = start + span.len as usize;
141 &self.pool[start..end]
142 }
143}
144
145impl<const MAX_INLINE_DEPTH: u8, const INLINE_STACK_CAP: usize> std::ops::Index<SpanSlice>
146 for MarkdownFile<'_, MAX_INLINE_DEPTH, INLINE_STACK_CAP>
147{
148 type Output = [InlineSpan];
149
150 fn index(&self, slice: SpanSlice) -> &[InlineSpan] {
151 let start = slice.start as usize;
152 let end = start + slice.len as usize;
153 &self.span_pool[start..end]
154 }
155}