basalt_core/
markdown.rs

//! A Markdown parser that transforms Markdown input into a custom abstract syntax tree (AST)
//! intended to be rendered with [basalt](https://github.com/erikjuhani/basalt)—a TUI application
//! for Obsidian.
//!
//! This module provides a [`Parser`] type, which processes raw Markdown input into a [`Vec`] of
//! [`Node`]s. These [`Node`]s represent semantic elements such as headings, paragraphs, block
//! quotes, and code blocks.
//!
//! The parser is built on top of [`pulldown_cmark`].
//!
//! ## Simple usage
//!
//! At the simplest level, you can parse a Markdown string by calling the [`from_str`] function:
//!
//! ```
//! use basalt_core::markdown::{from_str, Node, HeadingLevel, Text};
//!
//! let markdown = "# My Heading\n\nSome text.";
//! let nodes = from_str(markdown);
//!
//! assert_eq!(nodes, vec![
//!   Node::Heading {
//!     level: HeadingLevel::H1,
//!     text: Text::from("My Heading"),
//!   },
//!   Node::Paragraph {
//!     text: Text::from("Some text."),
//!   },
//! ])
//! ```
//!
//! ## Implementation details
//!
//! The [`Parser`] processes [`pulldown_cmark::Event`]s one by one, building up the current
//! [`Node`] in `current_node`. When an event indicates the start of a new structure (e.g.,
//! `Event::Start(Tag::Heading {..})`), the [`Parser`] pushes or replaces the current node
//! with a new one. When an event indicates the end of that structure, the node is finalized
//! and pushed into [`Parser::output`].
//!
//! Unrecognized events (such as [`InlineHtml`](pulldown_cmark::Event::InlineHtml)) are simply
//! ignored for the time being.
//!
//! ## Not yet implemented
//!
//! - Handling of inline HTML, math blocks, etc.
//! - Tracking code block language (`lang`) properly (currently set to [`None`]).
47use std::vec::IntoIter;
48
49use pulldown_cmark::{Event, Options, Tag, TagEnd};
50
/// An inline style that can be attached to a [`TextNode`].
///
/// The enum carries no data, so it is `Copy` and can be compared with `==`, used in `match`
/// guards, or stored in hash-based collections without cloning.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum Style {
    /// Inline code style (e.g. `` `code` ``).
    Code,
    /// Italic/emphasis style (e.g. `*emphasis*`).
    Emphasis,
    /// Strikethrough style (e.g. `~~strikethrough~~`).
    Strikethrough,
    /// Bold/strong style (e.g. `**strong**`).
    Strong,
}
63
/// Represents the variant of a list or task item (checked, unchecked, etc.).
///
/// Every variant is either data-less or holds a plain `u64`, so the type is `Copy` and
/// supports full equality and hashing.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum ItemKind {
    /// A checkbox item that is marked as done using `- [x]`.
    HardChecked,
    /// A checkbox item that is checked, but not explicitly recognized as
    /// `HardChecked` (e.g., `- [?]`).
    Checked,
    /// A checkbox item that is unchecked using `- [ ]`.
    Unchecked,
    // TODO: Remove in favor of using List node that has children of nodes
    /// An ordered list item (e.g., `1. item`), storing the numeric index.
    Ordered(u64),
    /// An unordered list item (e.g., `- item`).
    Unordered,
}
80
/// The level of a Markdown heading, from [`H1`](Self::H1) down to [`H6`](Self::H6).
///
/// The discriminants start at `1` and increase by one per variant, so casting a level with
/// `as` yields its numeric depth (`HeadingLevel::H3 as u8 == 3`).
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum HeadingLevel {
    /// First-level heading (`# Heading`).
    H1 = 1,
    /// Second-level heading (`## Heading`).
    H2,
    /// Third-level heading (`### Heading`).
    H3,
    /// Fourth-level heading (`#### Heading`).
    H4,
    /// Fifth-level heading (`##### Heading`).
    H5,
    /// Sixth-level heading (`###### Heading`).
    H6,
}
91
92impl From<pulldown_cmark::HeadingLevel> for HeadingLevel {
93    fn from(value: pulldown_cmark::HeadingLevel) -> Self {
94        match value {
95            pulldown_cmark::HeadingLevel::H1 => HeadingLevel::H1,
96            pulldown_cmark::HeadingLevel::H2 => HeadingLevel::H2,
97            pulldown_cmark::HeadingLevel::H3 => HeadingLevel::H3,
98            pulldown_cmark::HeadingLevel::H4 => HeadingLevel::H4,
99            pulldown_cmark::HeadingLevel::H5 => HeadingLevel::H5,
100            pulldown_cmark::HeadingLevel::H6 => HeadingLevel::H6,
101        }
102    }
103}
104
/// Represents specialized block quote kind variants (tip, note, warning, etc.).
///
/// Currently, the underlying [`pulldown_cmark`] parser distinguishes these via syntax like
/// `"> [!NOTE] Some note"`.
///
/// The enum carries no data, so it is `Copy` and supports full equality and hashing.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum BlockQuoteKind {
    /// A block quote flagged as a note.
    Note,
    /// A block quote flagged as a tip.
    Tip,
    /// A block quote flagged as important.
    Important,
    /// A block quote flagged as a warning.
    Warning,
    /// A block quote flagged as a caution.
    Caution,
}
118
119impl From<pulldown_cmark::BlockQuoteKind> for BlockQuoteKind {
120    fn from(value: pulldown_cmark::BlockQuoteKind) -> Self {
121        match value {
122            pulldown_cmark::BlockQuoteKind::Tip => BlockQuoteKind::Tip,
123            pulldown_cmark::BlockQuoteKind::Note => BlockQuoteKind::Note,
124            pulldown_cmark::BlockQuoteKind::Warning => BlockQuoteKind::Warning,
125            pulldown_cmark::BlockQuoteKind::Caution => BlockQuoteKind::Caution,
126            pulldown_cmark::BlockQuoteKind::Important => BlockQuoteKind::Important,
127        }
128    }
129}
130
/// Denotes whether a list is ordered or unordered.
///
/// Both variants are trivially copyable, so the type is `Copy` and supports full equality and
/// hashing.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum ListKind {
    /// An ordered list (e.g., `1. item`), storing a number.
    ///
    /// NOTE(review): the `u64` is presumably the number the list starts at — confirm once this
    /// variant is constructed somewhere.
    Ordered(u64),
    /// An unordered list (e.g., `- item`).
    Unordered,
}
139
140/// A single unit of text that is optionally styled (e.g., code).
141///
142/// [`TextNode`] can be any combination of sentence, words or characters.
143///
144/// Usually styled text will be contained in a single [`TextNode`] with the given [`Style`]
145/// property.
146#[derive(Clone, Debug, PartialEq, Default)]
147pub struct TextNode {
148    /// The literal text content.
149    pub content: String,
150    /// Optional inline style of the text.
151    pub style: Option<Style>,
152}
153
154impl TextNode {
155    /// Creates a new [`TextNode`] from `content` and optional [`Style`].
156    pub fn new(content: String, style: Option<Style>) -> Self {
157        Self { content, style }
158    }
159}
160
161/// A wrapper type holding a list of [`TextNode`]s.
162#[derive(Clone, Debug, PartialEq, Default)]
163pub struct Text(Vec<TextNode>);
164
165impl From<&str> for Text {
166    fn from(value: &str) -> Self {
167        Self(vec![TextNode::new(String::from(value), None)])
168    }
169}
170
171impl IntoIterator for Text {
172    type Item = TextNode;
173    type IntoIter = IntoIter<Self::Item>;
174    fn into_iter(self) -> Self::IntoIter {
175        self.0.into_iter()
176    }
177}
178
179impl Text {
180    /// Appends a [`TextNode`] to the inner text list.
181    fn push(&mut self, node: TextNode) {
182        self.0.push(node);
183    }
184}
185
186/// The Markdown AST node enumeration.
187#[derive(Clone, Debug, PartialEq)]
188#[allow(missing_docs)]
189pub enum Node {
190    /// A heading node that represents different heading levels.
191    ///
192    /// The level is controlled with the [`HeadingLevel`] definition.
193    Heading {
194        level: HeadingLevel,
195        text: Text,
196    },
197    Paragraph {
198        text: Text,
199    },
200    /// A block quote node that represents different quote block variants including callout blocks.
201    ///
202    /// The variant is controlled with the [`BlockQuoteKind`] definition. When [`BlockQuoteKind`]
203    /// is [`None`] the block quote should be interpreted as a regular block quote:
204    /// `"> Block quote"`.
205    BlockQuote {
206        kind: Option<BlockQuoteKind>,
207        nodes: Vec<Node>,
208    },
209    /// A fenced code block, optionally with a language identifier.
210    CodeBlock {
211        lang: Option<String>,
212        text: Text,
213    },
214    /// A list item node that represents different list item variants including task items.
215    ///
216    /// The variant is controlled with the [`ItemKind`] definition. When [`ItemKind`] is [`None`]
217    /// the item should be interpreted as unordered list item: `"- Item"`.
218    Item {
219        kind: Option<ItemKind>,
220        text: Text,
221    },
222}
223
224impl Node {
225    /// Pushes a [`TextNode`] into this node, if it contains a text buffer.
226    ///
227    /// If the node is a [`BlockQuote`], the [`TextNode`] will be pushed into the last child
228    /// [`Node`], if any.
229    /// ```
230    pub(crate) fn push_text_node(&mut self, node: TextNode) {
231        match self {
232            Node::Paragraph { text, .. }
233            | Node::Heading { text, .. }
234            | Node::CodeBlock { text, .. }
235            | Node::Item { text, .. } => text.push(node),
236            Node::BlockQuote { nodes, .. } => {
237                if let Some(last_node) = nodes.last_mut() {
238                    last_node.push_text_node(node);
239                }
240            }
241        }
242    }
243}
244
245/// Returns `true` if the [`Node`] should be closed upon encountering the given [`TagEnd`].
246fn matches_tag_end(node: &Node, tag_end: &TagEnd) -> bool {
247    match (node, tag_end) {
248        (Node::Paragraph { .. }, TagEnd::Paragraph)
249        | (Node::Heading { .. }, TagEnd::Heading(..))
250        | (Node::BlockQuote { .. }, TagEnd::BlockQuote(..))
251        | (Node::CodeBlock { .. }, TagEnd::CodeBlock)
252        | (Node::Item { .. }, TagEnd::Item) => true,
253        _ => false,
254    }
255}
256
257/// Parses the given Markdown input into a list of [`Node`]s.
258///
259/// This is a convenience function for constructing a [`Parser`] and calling [`Parser::parse`].  
260///
261/// # Examples
262///
263/// ```
264/// use basalt_core::markdown::{from_str, Node, HeadingLevel, Text};
265///
266/// let markdown = "# My Heading\n\nSome text.";
267/// let nodes = from_str(markdown);
268///
269/// assert_eq!(nodes, vec![
270///   Node::Heading {
271///     level: HeadingLevel::H1,
272///     text: Text::from("My Heading"),
273///   },
274///   Node::Paragraph {
275///     text: Text::from("Some text."),
276///   },
277/// ])
278/// ```
279pub fn from_str<'a>(text: &'a str) -> Vec<Node> {
280    Parser::new(text).parse()
281}
282
283/// A parser that consumes [`pulldown_cmark::Event`]s and produces a [`Vec`] of [`Node`].
284///
285/// # Examples
286///
287/// ```
288/// use basalt_core::markdown::{Parser, Node, HeadingLevel, Text};
289///
290/// let markdown = "# My Heading\n\nSome text.";
291/// let parser = Parser::new(markdown);
292/// let nodes = parser.parse();
293///
294/// assert_eq!(nodes, vec![
295///   Node::Heading {
296///     level: HeadingLevel::H1,
297///     text: Text::from("My Heading"),
298///   },
299///   Node::Paragraph {
300///     text: Text::from("Some text."),
301///   },
302/// ])
303/// ```
304pub struct Parser<'a> {
305    /// Contains the completed AST [`Node`]s.
306    pub output: Vec<Node>,
307    inner: pulldown_cmark::TextMergeStream<'a, pulldown_cmark::Parser<'a>>,
308    current_node: Option<Node>,
309}
310
311impl<'a> Iterator for Parser<'a> {
312    type Item = Event<'a>;
313    fn next(&mut self) -> Option<Self::Item> {
314        self.inner.next()
315    }
316}
317
318impl<'a> Parser<'a> {
319    /// Creates a new [`Parser`] from a Markdown input string.
320    ///
321    /// The parser uses [`pulldown_cmark::Parser::new_ext`] with [`Options::all()`] and
322    /// [`pulldown_cmark::TextMergeStream`] internally.
323    pub fn new(text: &'a str) -> Self {
324        let parser = pulldown_cmark::TextMergeStream::new(pulldown_cmark::Parser::new_ext(
325            text,
326            Options::all(),
327        ));
328
329        Self {
330            inner: parser,
331            output: vec![],
332            current_node: None,
333        }
334    }
335
336    /// Pushes a [`Node`] as a child if the current node is a [`BlockQuote`], otherwise sets it as
337    /// the `current_node`.
338    fn push_node(&mut self, node: Node) {
339        if let Some(Node::BlockQuote { nodes, .. }) = &mut self.current_node {
340            nodes.push(node);
341        } else {
342            self.set_node(&node);
343        }
344    }
345
346    /// Pushes a [`TextNode`] into the `current_node` if it exists.
347    fn push_text_node(&mut self, node: TextNode) {
348        if let Some(ref mut current) = self.current_node {
349            current.push_text_node(node);
350        }
351    }
352
353    /// Sets (or replaces) the `current_node` with a new one, discarding any old node.
354    fn set_node(&mut self, block: &Node) {
355        self.current_node.replace(block.clone());
356    }
357
358    /// Handles the start of a [`Tag`]. Pushes the matching semantic node to be processed.
359    fn tag(&mut self, tag: Tag<'a>) {
360        match tag {
361            Tag::Paragraph => self.push_node(Node::Paragraph {
362                text: Text::default(),
363            }),
364            Tag::Heading { level, .. } => self.push_node(Node::Heading {
365                level: level.into(),
366                text: Text::default(),
367            }),
368            Tag::BlockQuote(kind) => self.push_node(Node::BlockQuote {
369                kind: kind.map(|kind| kind.into()),
370                nodes: vec![],
371            }),
372            Tag::CodeBlock(_) => self.push_node(Node::CodeBlock {
373                lang: None,
374                text: Text::default(),
375            }),
376            Tag::Item => self.push_node(Node::Item {
377                kind: None,
378                text: Text::default(),
379            }),
380            // For now everything below this comment are defined as paragraph nodes
381            Tag::HtmlBlock
382            | Tag::List(_)
383            | Tag::FootnoteDefinition(_)
384            | Tag::Table(_)
385            | Tag::TableHead
386            | Tag::TableRow
387            | Tag::TableCell
388            | Tag::Emphasis
389            | Tag::Strong
390            | Tag::Strikethrough
391            | Tag::Link { .. }
392            | Tag::Image { .. }
393            | Tag::MetadataBlock(_)
394            | Tag::DefinitionList
395            | Tag::DefinitionListTitle
396            | Tag::DefinitionListDefinition => self.push_node(Node::Paragraph {
397                text: Text::default(),
398            }),
399        }
400    }
401
402    /// Handles the end of a [`Tag`], finalizing a node if matching.
403    fn tag_end(&mut self, tag_end: TagEnd) {
404        let Some(node) = self.current_node.take() else {
405            return;
406        };
407
408        if matches_tag_end(&node, &tag_end) {
409            self.output.push(node);
410        } else {
411            self.set_node(&node);
412        }
413    }
414
415    /// Processes a single [`Event`] from the underlying [`pulldown_cmark::Parser`] iterator.
416    fn handle_event(&mut self, event: Event<'a>) {
417        match event {
418            Event::Start(tag) => self.tag(tag),
419            Event::End(tag_end) => self.tag_end(tag_end),
420            Event::Text(text) => self.push_text_node(TextNode::new(text.to_string(), None)),
421            Event::Code(text) => {
422                self.push_text_node(TextNode::new(text.to_string(), Some(Style::Code)))
423            }
424            Event::TaskListMarker(checked) => {
425                if checked {
426                    self.set_node(&Node::Item {
427                        kind: Some(ItemKind::HardChecked),
428                        text: Text::default(),
429                    });
430                } else {
431                    self.set_node(&Node::Item {
432                        kind: Some(ItemKind::Unchecked),
433                        text: Text::default(),
434                    });
435                }
436            }
437            Event::InlineMath(_)
438            | Event::DisplayMath(_)
439            | Event::Html(_)
440            | Event::InlineHtml(_)
441            | Event::SoftBreak
442            | Event::HardBreak
443            | Event::Rule
444            | Event::FootnoteReference(_) => {
445                // TODO: Not yet implemented
446            }
447        }
448    }
449
450    /// Consumes the parser, processing all remaining events from the stream into a list of
451    /// [`Node`]s.
452    ///
453    /// # Examples
454    ///
455    /// ```
456    /// # use basalt_core::markdown::{Parser, Node, Text};
457    /// let parser = Parser::new("Hello world");
458    ///
459    /// let nodes = parser.parse();
460    ///
461    /// assert_eq!(nodes, vec![Node::Paragraph { text: Text::from("Hello world") }]);
462    /// ```
463    pub fn parse(mut self) -> Vec<Node> {
464        while let Some(event) = self.next() {
465            self.handle_event(event);
466        }
467
468        if let Some(node) = self.current_node.take() {
469            self.output.push(node);
470        }
471
472        self.output
473    }
474}
475
#[cfg(test)]
mod tests {
    use indoc::indoc;

    use super::*;

    /// Builds a [`Text`] made of a single unstyled [`TextNode`].
    fn text(content: &str) -> Text {
        Text::from(content)
    }

    /// Builds a paragraph node.
    fn p(content: &str) -> Node {
        Node::Paragraph {
            text: text(content),
        }
    }

    /// Builds a regular (kind-less) block quote holding `nodes`.
    fn blockquote(nodes: Vec<Node>) -> Node {
        Node::BlockQuote { kind: None, nodes }
    }

    /// Builds a list item node of the given kind.
    fn item_of(kind: Option<ItemKind>, content: &str) -> Node {
        Node::Item {
            kind,
            text: text(content),
        }
    }

    /// Builds a plain list item.
    fn item(content: &str) -> Node {
        item_of(None, content)
    }

    /// Builds an unchecked task item.
    fn task(content: &str) -> Node {
        item_of(Some(ItemKind::Unchecked), content)
    }

    /// Builds a task item checked with `[x]`.
    fn completed_task(content: &str) -> Node {
        item_of(Some(ItemKind::HardChecked), content)
    }

    /// Builds a heading node of the given level.
    fn heading(level: HeadingLevel, content: &str) -> Node {
        Node::Heading {
            level,
            text: text(content),
        }
    }

    fn h1(content: &str) -> Node {
        heading(HeadingLevel::H1, content)
    }

    fn h2(content: &str) -> Node {
        heading(HeadingLevel::H2, content)
    }

    fn h3(content: &str) -> Node {
        heading(HeadingLevel::H3, content)
    }

    fn h4(content: &str) -> Node {
        heading(HeadingLevel::H4, content)
    }

    fn h5(content: &str) -> Node {
        heading(HeadingLevel::H5, content)
    }

    fn h6(content: &str) -> Node {
        heading(HeadingLevel::H6, content)
    }

    #[test]
    fn test_parse() {
        // Each case pairs a Markdown input with the nodes it is expected to parse into.
        let cases = [
            (
                indoc! {r#"# Heading 1

                ## Heading 2

                ### Heading 3

                #### Heading 4

                ##### Heading 5

                ###### Heading 6
                "#},
                vec![
                    h1("Heading 1"),
                    h2("Heading 2"),
                    h3("Heading 3"),
                    h4("Heading 4"),
                    h5("Heading 5"),
                    h6("Heading 6"),
                ],
            ),
            (
                indoc! {r#"Paragraph

                > BlockQuote
                >
                > - List item in BlockQuote
                "#},
                vec![
                    p("Paragraph"),
                    blockquote(vec![
                        p("BlockQuote"),
                        Node::Paragraph {
                            text: Text::default(),
                        },
                        item("List item in BlockQuote"),
                    ]),
                ],
            ),
            // TODO: Implement correct test case when `- [?] ` task item syntax is supported
            // Now we interpret it as a regular paragraph
            (
                indoc! { r#"## Tasks

                - [ ] Task

                - [x] Completed task

                - [?] Completed task
                "#},
                vec![
                    h2("Tasks"),
                    task("Task"),
                    completed_task("Completed task"),
                    p("[?] Completed task"),
                ],
            ),
        ];

        for (markdown, expected) in &cases {
            assert_eq!(from_str(markdown), *expected);
        }
    }
}