Skip to main content

marki_parse/
lib.rs

1//! A fast, zero-copy `CommonMark` parser with SIMD-accelerated scanning.
2//!
3//! `marki-parse` parses Markdown into structured [`Section`] and [`Inline`] elements,
4//! borrowing directly from the input string with no intermediate allocations for
5//! text content.
6//!
7//! # Quick start
8//!
9//! ```
10//! use marki_parse::MarkdownFile;
11//!
12//! let md: MarkdownFile<'_> = MarkdownFile::parse("# Hello\n\nSome **bold** text.");
13//! for section in &md.sections {
14//!     println!("{section:?}");
15//! }
16//! ```
17//!
18//! # CRLF input
19//!
20//! The parser operates on LF (`\n`) line endings. For input that may contain
21//! `\r\n`, call [`MarkdownFile::normalize`] first — it returns the input borrowed when no
22//! `\r` is present (zero cost):
23//!
24//! ```
25//! let input = "# Hello\r\nWorld";
26//! let normalized = marki_parse::MarkdownFile::normalize(input);
27//! let md: marki_parse::MarkdownFile<'_> = marki_parse::MarkdownFile::parse(&normalized);
28//! ```
29//!
30//! # Accessing inline elements
31//!
32//! Inline elements are stored in a flat pool for cache efficiency. Use
33//! [`MarkdownFile::inlines`] and [`MarkdownFile::item_spans`] (or index with
34//! [`InlineSpan`] / [`SpanSlice`]) to retrieve them:
35//!
36//! ```
37//! use marki_parse::{MarkdownFile, Section};
38//!
39//! let md: MarkdownFile<'_> = MarkdownFile::parse("Hello **world**");
40//! if let Some(Section::Paragraph { content }) = md.sections.first() {
41//!     for inline in md.inlines(*content) {
42//!         println!("{inline:?}");
43//!     }
44//! }
45//! ```
46
47mod block;
48mod inline;
49mod section;
50pub(crate) mod simd;
51mod special_char;
52
53#[cfg(test)]
54mod fuzz_finds;
55#[cfg(test)]
56mod tests;
57
58pub use inline::Inline;
59
/// Narrowing conversions from collection lengths to the `u32` offsets stored
/// in spans.
///
/// Both methods panic instead of truncating when the value does not fit in a
/// `u32`; they differ only in the panic message, which names the pool that
/// overflowed.
pub(crate) trait OffsetExt {
    /// Converts `self` into a `u32` offset for the inline pool.
    fn pool_offset(self) -> u32;
    /// Converts `self` into a `u32` offset for the lines pool.
    fn lines_offset(self) -> u32;
}

impl OffsetExt for usize {
    fn pool_offset(self) -> u32 {
        self.try_into()
            .expect("inline pool exceeds u32::MAX elements")
    }

    fn lines_offset(self) -> u32 {
        self.try_into()
            .expect("lines pool exceeds u32::MAX elements")
    }
}
75use crate::simd::ByteSliceExt;
76pub use section::{InlineSpan, OrderedListDelimiter, Section, SpanSlice};
77pub use special_char::SpecialChar;
78
79use std::borrow::Cow;
80
/// A parsed Markdown document.
///
/// Holds the block-level [`Section`]s in document order together with two
/// private, flat pools: one of [`Inline`] elements and one of [`InlineSpan`]s.
/// Sections do not own their inline content; they carry spans that point into
/// these pools. Resolve a span with [`inlines`](Self::inlines) or
/// [`item_spans`](Self::item_spans), or by indexing the file with an
/// [`InlineSpan`] / [`SpanSlice`].
///
/// The const generics `MAX_INLINE_DEPTH` and `INLINE_STACK_CAP` control
/// recursion depth and stack-allocation size for emphasis parsing. The
/// defaults (`16` and `32`) are suitable for virtually all real-world input.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownFile<'src, const MAX_INLINE_DEPTH: u8 = 16, const INLINE_STACK_CAP: usize = 32> {
    /// The block-level sections of the document, in order.
    pub sections: Vec<Section<'src>>,
    /// Flat storage for inline elements; an [`InlineSpan`] selects a
    /// contiguous `start..start + len` run of this vector.
    pool: Vec<Inline<'src>>,
    /// Flat storage for inline spans; a [`SpanSlice`] selects a contiguous
    /// `start..start + len` run of this vector (used for list items).
    span_pool: Vec<InlineSpan>,
}
98
99/// On the default `MarkdownFile` (depth=16, cap=32) we expose `normalize` as
100/// an associated fn so callers don't need to import a free function.
101impl MarkdownFile<'_, 16, 32> {
102    /// Normalize line endings for parsing. Converts `\r\n` to `\n` and bare
103    /// `\r` (classic Mac) to `\n`. Returns the input borrowed if no carriage
104    /// returns are found (zero cost).
105    ///
106    /// Call this before [`MarkdownFile::parse`] when input may contain CRLF:
107    ///
108    /// ```
109    /// let input = "# Hello\r\nWorld";
110    /// let normalized = marki_parse::MarkdownFile::normalize(input);
111    /// let md: marki_parse::MarkdownFile<'_> = marki_parse::MarkdownFile::parse(&normalized);
112    /// ```
113    #[must_use]
114    pub fn normalize(input: &str) -> Cow<'_, str> {
115        let bytes = input.as_bytes();
116        if bytes
117            .find_byte(0, SpecialChar::CarriageReturn.byte())
118            .is_none()
119        {
120            return Cow::Borrowed(input);
121        }
122        let mut out = String::with_capacity(input.len());
123        let mut start = 0;
124        while let Some(cr) = bytes.find_byte(start, SpecialChar::CarriageReturn.byte()) {
125            out.push_str(&input[start..cr]);
126            out.push('\n');
127            start = cr + 1;
128            if bytes.get(start) == Some(&SpecialChar::Newline.byte()) {
129                start += 1;
130            }
131        }
132        out.push_str(&input[start..]);
133        Cow::Owned(out)
134    }
135}
136
137impl<'src, const MAX_INLINE_DEPTH: u8, const INLINE_STACK_CAP: usize>
138    MarkdownFile<'src, MAX_INLINE_DEPTH, INLINE_STACK_CAP>
139{
140    /// Get the inline elements referenced by a span.
141    #[must_use]
142    pub fn inlines(&self, span: InlineSpan) -> &[Inline<'src>] {
143        &self[span]
144    }
145
146    /// Get the item spans referenced by a `SpanSlice` (list items).
147    #[must_use]
148    pub fn item_spans(&self, slice: SpanSlice) -> &[InlineSpan] {
149        &self[slice]
150    }
151
152    /// Walk every section and dereference every inline span. Used in tests and
153    /// fuzz targets to assert the parser does not panic on arbitrary input.
154    #[cfg(test)]
155    pub(crate) fn walk_all_inlines(&self) {
156        for section in &self.sections {
157            match section {
158                Section::UnorderedList { items } | Section::OrderedList { items, .. } => {
159                    for &span in self.item_spans(*items) {
160                        let _ = self.inlines(span);
161                    }
162                }
163                Section::Heading { content, .. }
164                | Section::Paragraph { content }
165                | Section::Blockquote { content } => {
166                    let _ = self.inlines(*content);
167                }
168                Section::CodeBlock { .. } | Section::HorizontalRule => {}
169            }
170        }
171    }
172}
173
174impl<'src, const MAX_INLINE_DEPTH: u8, const INLINE_STACK_CAP: usize> std::ops::Index<InlineSpan>
175    for MarkdownFile<'src, MAX_INLINE_DEPTH, INLINE_STACK_CAP>
176{
177    type Output = [Inline<'src>];
178
179    fn index(&self, span: InlineSpan) -> &[Inline<'src>] {
180        let start = span.start as usize;
181        let end = start + span.len as usize;
182        &self.pool[start..end]
183    }
184}
185
186impl<const MAX_INLINE_DEPTH: u8, const INLINE_STACK_CAP: usize> std::ops::Index<SpanSlice>
187    for MarkdownFile<'_, MAX_INLINE_DEPTH, INLINE_STACK_CAP>
188{
189    type Output = [InlineSpan];
190
191    fn index(&self, slice: SpanSlice) -> &[InlineSpan] {
192        let start = slice.start as usize;
193        let end = start + slice.len as usize;
194        &self.span_pool[start..end]
195    }
196}