marki_parse/lib.rs
1//! A fast, zero-copy `CommonMark` parser with SIMD-accelerated scanning.
2//!
3//! `marki-parse` parses Markdown into structured [`Section`] and [`Inline`] elements,
4//! borrowing directly from the input string with no intermediate allocations for
5//! text content.
6//!
7//! # Quick start
8//!
9//! ```
10//! use marki_parse::MarkdownFile;
11//!
12//! let md: MarkdownFile<'_> = MarkdownFile::parse("# Hello\n\nSome **bold** text.");
13//! for section in &md.sections {
14//! println!("{section:?}");
15//! }
16//! ```
17//!
18//! # CRLF input
19//!
20//! The parser operates on LF (`\n`) line endings. For input that may contain
21//! `\r\n`, call [`MarkdownFile::normalize`] first — it returns the input borrowed when no
22//! `\r` is present (zero cost):
23//!
24//! ```
25//! let input = "# Hello\r\nWorld";
26//! let normalized = marki_parse::MarkdownFile::normalize(input);
27//! let md: marki_parse::MarkdownFile<'_> = marki_parse::MarkdownFile::parse(&normalized);
28//! ```
29//!
30//! # Accessing inline elements
31//!
32//! Inline elements are stored in a flat pool for cache efficiency. Use
33//! [`MarkdownFile::inlines`] and [`MarkdownFile::item_spans`] (or index with
34//! [`InlineSpan`] / [`SpanSlice`]) to retrieve them:
35//!
36//! ```
37//! use marki_parse::{MarkdownFile, Section};
38//!
39//! let md: MarkdownFile<'_> = MarkdownFile::parse("Hello **world**");
40//! if let Some(Section::Paragraph { content }) = md.sections.first() {
41//! for inline in md.inlines(*content) {
42//! println!("{inline:?}");
43//! }
44//! }
45//! ```
46
47mod block;
48mod inline;
49mod section;
50pub(crate) mod simd;
51mod special_char;
52
53#[cfg(test)]
54mod fuzz_finds;
55#[cfg(test)]
56mod tests;
57
58pub use inline::Inline;
59
/// Narrowing helpers that turn collection lengths (`usize`) into the `u32`
/// offsets stored in spans.
///
/// Both methods perform the same checked conversion; they differ only in the
/// panic message, which names the pool whose length overflowed.
pub(crate) trait OffsetExt {
    /// Convert an inline-pool length into a `u32` offset.
    ///
    /// # Panics
    ///
    /// Panics if the value does not fit in a `u32`.
    fn pool_offset(self) -> u32;

    /// Convert a lines-pool length into a `u32` offset.
    ///
    /// # Panics
    ///
    /// Panics if the value does not fit in a `u32`.
    fn lines_offset(self) -> u32;
}

impl OffsetExt for usize {
    fn pool_offset(self) -> u32 {
        // A checked conversion: silently truncating here would make spans
        // point at the wrong elements, so overflow is treated as fatal.
        let narrowed = u32::try_from(self);
        narrowed.expect("inline pool exceeds u32::MAX elements")
    }

    fn lines_offset(self) -> u32 {
        let narrowed = u32::try_from(self);
        narrowed.expect("lines pool exceeds u32::MAX elements")
    }
}
75use crate::simd::ByteSliceExt;
76pub use section::{InlineSpan, OrderedListDelimiter, Section, SpanSlice};
77pub use special_char::SpecialChar;
78
79use std::borrow::Cow;
80
81/// A parsed Markdown document.
82///
83/// Contains the block-level [`Section`]s and the internal pools that store
84/// [`Inline`] elements. Use [`inlines`](Self::inlines) and
85/// [`item_spans`](Self::item_spans) to access inline content referenced by
86/// sections.
87///
88/// The const generics `MAX_INLINE_DEPTH` and `INLINE_STACK_CAP` control
89/// recursion depth and stack-allocation size for emphasis parsing. The
90/// defaults (`16` and `32`) are suitable for virtually all real-world input.
91#[derive(Debug, Clone, PartialEq, Eq)]
92pub struct MarkdownFile<'src, const MAX_INLINE_DEPTH: u8 = 16, const INLINE_STACK_CAP: usize = 32> {
93 /// The block-level sections of the document, in order.
94 pub sections: Vec<Section<'src>>,
95 pool: Vec<Inline<'src>>,
96 span_pool: Vec<InlineSpan>,
97}
98
99/// On the default `MarkdownFile` (depth=16, cap=32) we expose `normalize` as
100/// an associated fn so callers don't need to import a free function.
101impl MarkdownFile<'_, 16, 32> {
102 /// Normalize line endings for parsing. Converts `\r\n` to `\n` and bare
103 /// `\r` (classic Mac) to `\n`. Returns the input borrowed if no carriage
104 /// returns are found (zero cost).
105 ///
106 /// Call this before [`MarkdownFile::parse`] when input may contain CRLF:
107 ///
108 /// ```
109 /// let input = "# Hello\r\nWorld";
110 /// let normalized = marki_parse::MarkdownFile::normalize(input);
111 /// let md: marki_parse::MarkdownFile<'_> = marki_parse::MarkdownFile::parse(&normalized);
112 /// ```
113 #[must_use]
114 pub fn normalize(input: &str) -> Cow<'_, str> {
115 let bytes = input.as_bytes();
116 if bytes
117 .find_byte(0, SpecialChar::CarriageReturn.byte())
118 .is_none()
119 {
120 return Cow::Borrowed(input);
121 }
122 let mut out = String::with_capacity(input.len());
123 let mut start = 0;
124 while let Some(cr) = bytes.find_byte(start, SpecialChar::CarriageReturn.byte()) {
125 out.push_str(&input[start..cr]);
126 out.push('\n');
127 start = cr + 1;
128 if bytes.get(start) == Some(&SpecialChar::Newline.byte()) {
129 start += 1;
130 }
131 }
132 out.push_str(&input[start..]);
133 Cow::Owned(out)
134 }
135}
136
137impl<'src, const MAX_INLINE_DEPTH: u8, const INLINE_STACK_CAP: usize>
138 MarkdownFile<'src, MAX_INLINE_DEPTH, INLINE_STACK_CAP>
139{
140 /// Get the inline elements referenced by a span.
141 #[must_use]
142 pub fn inlines(&self, span: InlineSpan) -> &[Inline<'src>] {
143 &self[span]
144 }
145
146 /// Get the item spans referenced by a `SpanSlice` (list items).
147 #[must_use]
148 pub fn item_spans(&self, slice: SpanSlice) -> &[InlineSpan] {
149 &self[slice]
150 }
151
152 /// Walk every section and dereference every inline span. Used in tests and
153 /// fuzz targets to assert the parser does not panic on arbitrary input.
154 #[cfg(test)]
155 pub(crate) fn walk_all_inlines(&self) {
156 for section in &self.sections {
157 match section {
158 Section::UnorderedList { items } | Section::OrderedList { items, .. } => {
159 for &span in self.item_spans(*items) {
160 let _ = self.inlines(span);
161 }
162 }
163 Section::Heading { content, .. }
164 | Section::Paragraph { content }
165 | Section::Blockquote { content } => {
166 let _ = self.inlines(*content);
167 }
168 Section::CodeBlock { .. } | Section::HorizontalRule => {}
169 }
170 }
171 }
172}
173
174impl<'src, const MAX_INLINE_DEPTH: u8, const INLINE_STACK_CAP: usize> std::ops::Index<InlineSpan>
175 for MarkdownFile<'src, MAX_INLINE_DEPTH, INLINE_STACK_CAP>
176{
177 type Output = [Inline<'src>];
178
179 fn index(&self, span: InlineSpan) -> &[Inline<'src>] {
180 let start = span.start as usize;
181 let end = start + span.len as usize;
182 &self.pool[start..end]
183 }
184}
185
186impl<const MAX_INLINE_DEPTH: u8, const INLINE_STACK_CAP: usize> std::ops::Index<SpanSlice>
187 for MarkdownFile<'_, MAX_INLINE_DEPTH, INLINE_STACK_CAP>
188{
189 type Output = [InlineSpan];
190
191 fn index(&self, slice: SpanSlice) -> &[InlineSpan] {
192 let start = slice.start as usize;
193 let end = start + slice.len as usize;
194 &self.span_pool[start..end]
195 }
196}