basalt_core/
markdown.rs

1//! A Markdown parser that transforms Markdown input into a custom abstract syntax tree (AST)
//! intended to be rendered with [basalt](https://github.com/erikjuhani/basalt)—a TUI application
3//! for Obsidian.
4//!
5//! This module provides a [`Parser`] type, which processes raw Markdown input into a [`Vec`] of
6//! [`Node`]s. These [`Node`]s represent semantic elements such as headings, paragraphs, block
7//! quotes, and code blocks.
8//!
9//! The parser is built on top of [`pulldown_cmark`].
10//!
11//! ## Simple usage
12//!
13//! At the simplest level, you can parse a Markdown string by calling the [`from_str`] function:
14//!
15//! ```
16//! use basalt_core::markdown::{from_str, Node, HeadingLevel, Text};
17//!
18//! let markdown = "# My Heading\n\nSome text.";
19//! let nodes = from_str(markdown);
20//!
21//! assert_eq!(nodes, vec![
22//!   Node::Heading {
23//!     level: HeadingLevel::H1,
24//!     text: Text::from("My Heading"),
25//!   },
26//!   Node::Paragraph {
27//!     text: Text::from("Some text."),
28//!   },
29//! ])
30//! ```
31//!
32//! ## Implementation details
33//!
34//! The [`Parser`] processes [`pulldown_cmark::Event`]s one by one, building up the current
35//! [`Node`] in `current_node`. When an event indicates the start of a new structure (e.g.,
36//! `Event::Start(Tag::Heading {..})`), the [`Parser`] pushes or replaces the current node
37//! with a new one. When an event indicates the end of that structure, the node is finalized
38//! and pushed into [`Parser::output`].
39//!
40//! Unrecognized events (such as [`InlineHtml`](pulldown_cmark::Event::InlineHtml)) are simply
41//! ignored for the time being.
42//!
43//! ## Not yet implemented
44//!
45//! - Handling of inline HTML, math blocks, etc.
46//! - Tracking code block language (`lang`) properly (currently set to [`None`]).
47use std::vec::IntoIter;
48
49use pulldown_cmark::{Event, Options, Tag, TagEnd};
50
/// An inline style that can be applied to a [`TextNode`].
///
/// Equality between styles is total, so the type implements [`Eq`] in addition to
/// [`PartialEq`].
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Style {
    /// Inline code style (e.g. `` `code` ``).
    Code,
    /// Italic/emphasis style (e.g. `*emphasis*`).
    Emphasis,
    /// Strikethrough style (e.g. `~~strikethrough~~`).
    Strikethrough,
    /// Bold/strong style (e.g. `**strong**`).
    Strong,
}
63
/// Represents the variant of a list or task item (checked, unchecked, etc.).
///
/// Equality between kinds is total, so the type implements [`Eq`] in addition to
/// [`PartialEq`].
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ItemKind {
    /// A checkbox item that is marked as done using `- [x]`.
    HardChecked,
    /// A checkbox item that is checked, but not explicitly recognized as
    /// `HardChecked` (e.g., `- [?]`).
    Checked,
    /// A checkbox item that is unchecked using `- [ ]`.
    Unchecked,
    // TODO: Remove in favor of using List node that has children of nodes
    /// An ordered list item (e.g., `1. item`), storing the numeric index.
    Ordered(u64),
    /// An unordered list item (e.g., `- item`).
    Unordered,
}
80
/// The level of a heading, from `H1` (written `# Heading`) down to `H6`
/// (written `###### Heading`).
///
/// The discriminants are numbered from 1, so `HeadingLevel::H2 as u8 == 2`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum HeadingLevel {
    /// A level 1 heading (`# Heading`).
    H1 = 1,
    /// A level 2 heading (`## Heading`).
    H2,
    /// A level 3 heading (`### Heading`).
    H3,
    /// A level 4 heading (`#### Heading`).
    H4,
    /// A level 5 heading (`##### Heading`).
    H5,
    /// A level 6 heading (`###### Heading`).
    H6,
}
91
92impl From<pulldown_cmark::HeadingLevel> for HeadingLevel {
93    fn from(value: pulldown_cmark::HeadingLevel) -> Self {
94        match value {
95            pulldown_cmark::HeadingLevel::H1 => HeadingLevel::H1,
96            pulldown_cmark::HeadingLevel::H2 => HeadingLevel::H2,
97            pulldown_cmark::HeadingLevel::H3 => HeadingLevel::H3,
98            pulldown_cmark::HeadingLevel::H4 => HeadingLevel::H4,
99            pulldown_cmark::HeadingLevel::H5 => HeadingLevel::H5,
100            pulldown_cmark::HeadingLevel::H6 => HeadingLevel::H6,
101        }
102    }
103}
104
/// Represents specialized block quote kind variants (tip, note, warning, etc.).
///
/// Currently, the underlying [`pulldown_cmark`] parser distinguishes these via syntax like
/// `"> [!NOTE] Some note"`.
///
/// Equality between kinds is total, so the type implements [`Eq`] in addition to
/// [`PartialEq`].
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum BlockQuoteKind {
    /// A note callout (`> [!NOTE]`).
    Note,
    /// A tip callout (`> [!TIP]`).
    Tip,
    /// An important callout (`> [!IMPORTANT]`).
    Important,
    /// A warning callout (`> [!WARNING]`).
    Warning,
    /// A caution callout (`> [!CAUTION]`).
    Caution,
}
118
119impl From<pulldown_cmark::BlockQuoteKind> for BlockQuoteKind {
120    fn from(value: pulldown_cmark::BlockQuoteKind) -> Self {
121        match value {
122            pulldown_cmark::BlockQuoteKind::Tip => BlockQuoteKind::Tip,
123            pulldown_cmark::BlockQuoteKind::Note => BlockQuoteKind::Note,
124            pulldown_cmark::BlockQuoteKind::Warning => BlockQuoteKind::Warning,
125            pulldown_cmark::BlockQuoteKind::Caution => BlockQuoteKind::Caution,
126            pulldown_cmark::BlockQuoteKind::Important => BlockQuoteKind::Important,
127        }
128    }
129}
130
/// Denotes whether a list is ordered or unordered.
///
/// Equality between kinds is total, so the type implements [`Eq`] in addition to
/// [`PartialEq`].
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ListKind {
    /// An ordered list (e.g., `1. item`), storing a numeric index.
    // NOTE(review): presumably the number the list starts at; nothing in this module constructs
    // the variant yet, so confirm once it is wired into the parser.
    Ordered(u64),
    /// An unordered list (e.g., `- item`).
    Unordered,
}
139
140/// A single unit of text that is optionally styled (e.g., code).
141///
142/// [`TextNode`] can be any combination of sentence, words or characters.
143///
144/// Usually styled text will be contained in a single [`TextNode`] with the given [`Style`]
145/// property.
146#[derive(Clone, Debug, PartialEq, Default)]
147pub struct TextNode {
148    /// The literal text content.
149    pub content: String,
150    /// Optional inline style of the text.
151    pub style: Option<Style>,
152}
153
154impl From<&str> for TextNode {
155    fn from(value: &str) -> Self {
156        value.to_string().into()
157    }
158}
159
160impl From<String> for TextNode {
161    fn from(value: String) -> Self {
162        Self {
163            content: value,
164            ..Default::default()
165        }
166    }
167}
168
169impl TextNode {
170    /// Creates a new [`TextNode`] from `content` and optional [`Style`].
171    pub fn new(content: String, style: Option<Style>) -> Self {
172        Self { content, style }
173    }
174}
175
176/// A wrapper type holding a list of [`TextNode`]s.
177#[derive(Clone, Debug, PartialEq, Default)]
178pub struct Text(Vec<TextNode>);
179
180impl From<&str> for Text {
181    fn from(value: &str) -> Self {
182        TextNode::from(value).into()
183    }
184}
185
186impl From<String> for Text {
187    fn from(value: String) -> Self {
188        TextNode::from(value).into()
189    }
190}
191
192impl From<TextNode> for Text {
193    fn from(value: TextNode) -> Self {
194        Self([value].to_vec())
195    }
196}
197
198impl From<Vec<TextNode>> for Text {
199    fn from(value: Vec<TextNode>) -> Self {
200        Self(value)
201    }
202}
203
204impl From<&[TextNode]> for Text {
205    fn from(value: &[TextNode]) -> Self {
206        Self(value.to_vec())
207    }
208}
209
210impl IntoIterator for Text {
211    type Item = TextNode;
212    type IntoIter = IntoIter<Self::Item>;
213    fn into_iter(self) -> Self::IntoIter {
214        self.0.into_iter()
215    }
216}
217
218impl Text {
219    /// Appends a [`TextNode`] to the inner text list.
220    fn push(&mut self, node: TextNode) {
221        self.0.push(node);
222    }
223}
224
225/// The Markdown AST node enumeration.
226#[derive(Clone, Debug, PartialEq)]
227#[allow(missing_docs)]
228pub enum Node {
229    /// A heading node that represents different heading levels.
230    ///
231    /// The level is controlled with the [`HeadingLevel`] definition.
232    Heading {
233        level: HeadingLevel,
234        text: Text,
235    },
236    Paragraph {
237        text: Text,
238    },
239    /// A block quote node that represents different quote block variants including callout blocks.
240    ///
241    /// The variant is controlled with the [`BlockQuoteKind`] definition. When [`BlockQuoteKind`]
242    /// is [`None`] the block quote should be interpreted as a regular block quote:
243    /// `"> Block quote"`.
244    BlockQuote {
245        kind: Option<BlockQuoteKind>,
246        nodes: Vec<Node>,
247    },
248    /// A fenced code block, optionally with a language identifier.
249    CodeBlock {
250        lang: Option<String>,
251        text: Text,
252    },
253    /// A list item node that represents different list item variants including task items.
254    ///
255    /// The variant is controlled with the [`ItemKind`] definition. When [`ItemKind`] is [`None`]
256    /// the item should be interpreted as unordered list item: `"- Item"`.
257    Item {
258        kind: Option<ItemKind>,
259        text: Text,
260    },
261}
262
263impl Node {
264    /// Pushes a [`TextNode`] into this node, if it contains a text buffer.
265    ///
266    /// If the node is a [`BlockQuote`], the [`TextNode`] will be pushed into the last child
267    /// [`Node`], if any.
268    /// ```
269    pub(crate) fn push_text_node(&mut self, node: TextNode) {
270        match self {
271            Node::Paragraph { text, .. }
272            | Node::Heading { text, .. }
273            | Node::CodeBlock { text, .. }
274            | Node::Item { text, .. } => text.push(node),
275            Node::BlockQuote { nodes, .. } => {
276                if let Some(last_node) = nodes.last_mut() {
277                    last_node.push_text_node(node);
278                }
279            }
280        }
281    }
282}
283
284/// Returns `true` if the [`Node`] should be closed upon encountering the given [`TagEnd`].
285fn matches_tag_end(node: &Node, tag_end: &TagEnd) -> bool {
286    match (node, tag_end) {
287        (Node::Paragraph { .. }, TagEnd::Paragraph)
288        | (Node::Heading { .. }, TagEnd::Heading(..))
289        | (Node::BlockQuote { .. }, TagEnd::BlockQuote(..))
290        | (Node::CodeBlock { .. }, TagEnd::CodeBlock)
291        | (Node::Item { .. }, TagEnd::Item) => true,
292        _ => false,
293    }
294}
295
296/// Parses the given Markdown input into a list of [`Node`]s.
297///
298/// This is a convenience function for constructing a [`Parser`] and calling [`Parser::parse`].  
299///
300/// # Examples
301///
302/// ```
303/// use basalt_core::markdown::{from_str, Node, HeadingLevel, Text};
304///
305/// let markdown = "# My Heading\n\nSome text.";
306/// let nodes = from_str(markdown);
307///
308/// assert_eq!(nodes, vec![
309///   Node::Heading {
310///     level: HeadingLevel::H1,
311///     text: Text::from("My Heading"),
312///   },
313///   Node::Paragraph {
314///     text: Text::from("Some text."),
315///   },
316/// ])
317/// ```
318pub fn from_str<'a>(text: &'a str) -> Vec<Node> {
319    Parser::new(text).parse()
320}
321
322/// A parser that consumes [`pulldown_cmark::Event`]s and produces a [`Vec`] of [`Node`].
323///
324/// # Examples
325///
326/// ```
327/// use basalt_core::markdown::{Parser, Node, HeadingLevel, Text};
328///
329/// let markdown = "# My Heading\n\nSome text.";
330/// let parser = Parser::new(markdown);
331/// let nodes = parser.parse();
332///
333/// assert_eq!(nodes, vec![
334///   Node::Heading {
335///     level: HeadingLevel::H1,
336///     text: Text::from("My Heading"),
337///   },
338///   Node::Paragraph {
339///     text: Text::from("Some text."),
340///   },
341/// ])
342/// ```
343pub struct Parser<'a> {
344    /// Contains the completed AST [`Node`]s.
345    pub output: Vec<Node>,
346    inner: pulldown_cmark::TextMergeStream<'a, pulldown_cmark::Parser<'a>>,
347    current_node: Option<Node>,
348}
349
350impl<'a> Iterator for Parser<'a> {
351    type Item = Event<'a>;
352    fn next(&mut self) -> Option<Self::Item> {
353        self.inner.next()
354    }
355}
356
357impl<'a> Parser<'a> {
358    /// Creates a new [`Parser`] from a Markdown input string.
359    ///
360    /// The parser uses [`pulldown_cmark::Parser::new_ext`] with [`Options::all()`] and
361    /// [`pulldown_cmark::TextMergeStream`] internally.
362    pub fn new(text: &'a str) -> Self {
363        let parser = pulldown_cmark::TextMergeStream::new(pulldown_cmark::Parser::new_ext(
364            text,
365            Options::all(),
366        ));
367
368        Self {
369            inner: parser,
370            output: vec![],
371            current_node: None,
372        }
373    }
374
375    /// Pushes a [`Node`] as a child if the current node is a [`BlockQuote`], otherwise sets it as
376    /// the `current_node`.
377    fn push_node(&mut self, node: Node) {
378        if let Some(Node::BlockQuote { nodes, .. }) = &mut self.current_node {
379            nodes.push(node);
380        } else {
381            self.set_node(&node);
382        }
383    }
384
385    /// Pushes a [`TextNode`] into the `current_node` if it exists.
386    fn push_text_node(&mut self, node: TextNode) {
387        if let Some(ref mut current) = self.current_node {
388            current.push_text_node(node);
389        }
390    }
391
392    /// Sets (or replaces) the `current_node` with a new one, discarding any old node.
393    fn set_node(&mut self, block: &Node) {
394        self.current_node.replace(block.clone());
395    }
396
397    /// Handles the start of a [`Tag`]. Pushes the matching semantic node to be processed.
398    fn tag(&mut self, tag: Tag<'a>) {
399        match tag {
400            Tag::Paragraph => self.push_node(Node::Paragraph {
401                text: Text::default(),
402            }),
403            Tag::Heading { level, .. } => self.push_node(Node::Heading {
404                level: level.into(),
405                text: Text::default(),
406            }),
407            Tag::BlockQuote(kind) => self.push_node(Node::BlockQuote {
408                kind: kind.map(|kind| kind.into()),
409                nodes: vec![],
410            }),
411            Tag::CodeBlock(_) => self.push_node(Node::CodeBlock {
412                lang: None,
413                text: Text::default(),
414            }),
415            Tag::Item => self.push_node(Node::Item {
416                kind: None,
417                text: Text::default(),
418            }),
419            // For now everything below this comment are defined as paragraph nodes
420            Tag::HtmlBlock
421            | Tag::List(_)
422            | Tag::FootnoteDefinition(_)
423            | Tag::Table(_)
424            | Tag::TableHead
425            | Tag::TableRow
426            | Tag::TableCell
427            | Tag::Emphasis
428            | Tag::Strong
429            | Tag::Strikethrough
430            | Tag::Link { .. }
431            | Tag::Image { .. }
432            | Tag::MetadataBlock(_)
433            | Tag::DefinitionList
434            | Tag::DefinitionListTitle
435            | Tag::DefinitionListDefinition => {}
436        }
437    }
438
439    /// Handles the end of a [`Tag`], finalizing a node if matching.
440    fn tag_end(&mut self, tag_end: TagEnd) {
441        let Some(node) = self.current_node.take() else {
442            return;
443        };
444
445        if matches_tag_end(&node, &tag_end) {
446            self.output.push(node);
447        } else {
448            self.set_node(&node);
449        }
450    }
451
452    /// Processes a single [`Event`] from the underlying [`pulldown_cmark::Parser`] iterator.
453    fn handle_event(&mut self, event: Event<'a>) {
454        match event {
455            Event::Start(tag) => self.tag(tag),
456            Event::End(tag_end) => self.tag_end(tag_end),
457            Event::Text(text) => self.push_text_node(TextNode::new(text.to_string(), None)),
458            Event::Code(text) => {
459                self.push_text_node(TextNode::new(text.to_string(), Some(Style::Code)))
460            }
461            Event::TaskListMarker(checked) => {
462                if checked {
463                    self.set_node(&Node::Item {
464                        kind: Some(ItemKind::HardChecked),
465                        text: Text::default(),
466                    });
467                } else {
468                    self.set_node(&Node::Item {
469                        kind: Some(ItemKind::Unchecked),
470                        text: Text::default(),
471                    });
472                }
473            }
474            Event::InlineMath(_)
475            | Event::DisplayMath(_)
476            | Event::Html(_)
477            | Event::InlineHtml(_)
478            | Event::SoftBreak
479            | Event::HardBreak
480            | Event::Rule
481            | Event::FootnoteReference(_) => {
482                // TODO: Not yet implemented
483            }
484        }
485    }
486
487    /// Consumes the parser, processing all remaining events from the stream into a list of
488    /// [`Node`]s.
489    ///
490    /// # Examples
491    ///
492    /// ```
493    /// # use basalt_core::markdown::{Parser, Node, Text};
494    /// let parser = Parser::new("Hello world");
495    ///
496    /// let nodes = parser.parse();
497    ///
498    /// assert_eq!(nodes, vec![Node::Paragraph { text: Text::from("Hello world") }]);
499    /// ```
500    pub fn parse(mut self) -> Vec<Node> {
501        while let Some(event) = self.next() {
502            self.handle_event(event);
503        }
504
505        if let Some(node) = self.current_node.take() {
506            self.output.push(node);
507        }
508
509        self.output
510    }
511}
512
#[cfg(test)]
mod tests {
    use indoc::indoc;

    use super::*;

    /// Builds an unstyled paragraph node.
    fn p(content: &str) -> Node {
        Node::Paragraph {
            text: Text::from(content),
        }
    }

    /// Builds a regular (non-callout) block quote with the given children.
    fn blockquote(nodes: Vec<Node>) -> Node {
        Node::BlockQuote { kind: None, nodes }
    }

    /// Builds a list item node with the given kind and unstyled text.
    fn item_of(kind: Option<ItemKind>, content: &str) -> Node {
        Node::Item {
            kind,
            text: Text::from(content),
        }
    }

    /// Builds a plain list item.
    fn item(content: &str) -> Node {
        item_of(None, content)
    }

    /// Builds an unchecked task item.
    fn task(content: &str) -> Node {
        item_of(Some(ItemKind::Unchecked), content)
    }

    /// Builds a task item that was checked with `[x]`.
    fn completed_task(content: &str) -> Node {
        item_of(Some(ItemKind::HardChecked), content)
    }

    /// Builds a heading node of the given level with unstyled text.
    fn heading(level: HeadingLevel, content: &str) -> Node {
        Node::Heading {
            level,
            text: Text::from(content),
        }
    }

    fn h1(content: &str) -> Node {
        heading(HeadingLevel::H1, content)
    }

    fn h2(content: &str) -> Node {
        heading(HeadingLevel::H2, content)
    }

    fn h3(content: &str) -> Node {
        heading(HeadingLevel::H3, content)
    }

    fn h4(content: &str) -> Node {
        heading(HeadingLevel::H4, content)
    }

    fn h5(content: &str) -> Node {
        heading(HeadingLevel::H5, content)
    }

    fn h6(content: &str) -> Node {
        heading(HeadingLevel::H6, content)
    }

    #[test]
    fn test_parse() {
        // Each case pairs a Markdown input with the nodes it is expected to parse into.
        let tests = [
            (
                indoc! {r#"# Heading 1

                ## Heading 2

                ### Heading 3

                #### Heading 4

                ##### Heading 5

                ###### Heading 6
                "#},
                vec![
                    h1("Heading 1"),
                    h2("Heading 2"),
                    h3("Heading 3"),
                    h4("Heading 4"),
                    h5("Heading 5"),
                    h6("Heading 6"),
                ],
            ),
            // TODO: Implement correct test case when `- [?] ` task item syntax is supported
            // Now we interpret it as a regular paragraph
            (
                indoc! { r#"## Tasks

                - [ ] Task

                - [x] Completed task

                - [?] Completed task
                "#},
                vec![
                    h2("Tasks"),
                    task("Task"),
                    completed_task("Completed task"),
                    p("[?] Completed task"),
                ],
            ),
            (
                indoc! {r#"## Quotes

                You _can_ quote text by adding a `>` symbols before the text.

                > Human beings face ever more complex and urgent problems, and their effectiveness in dealing with these problems is a matter that is critical to the stability and continued progress of society.
                >
                >- Doug Engelbart, 1961
                "#},
                vec![
                    h2("Quotes"),
                    Node::Paragraph {
                        text: vec![
                            TextNode::new("You ".into(), None),
                            TextNode::new("can".into(), None),
                            TextNode::new(" quote text by adding a ".into(), None),
                            TextNode::new(">".into(), Some(Style::Code)),
                            TextNode::new(" symbols before the text.".into(), None),
                        ]
                        .into(),
                    },
                    blockquote(vec![
                        p("Human beings face ever more complex and urgent problems, and their effectiveness in dealing with these problems is a matter that is critical to the stability and continued progress of society."),
                        item("Doug Engelbart, 1961")
                    ]),
                ],
            ),
        ];

        for (markdown, expected) in tests.iter() {
            assert_eq!(from_str(markdown), *expected);
        }
    }
}