basalt_core/markdown.rs
1//! A Markdown parser that transforms Markdown input into a custom abstract syntax tree (AST)
2//! intended to be rendered with [basalt](https://github.com/erikjuhani/basalt)—a TUI application
3//! for Obsidian.
4//!
5//! This module provides a [`Parser`] type, which processes raw Markdown input into a [`Vec`] of
6//! [`Node`]s. These [`Node`]s represent semantic elements such as headings, paragraphs, block
7//! quotes, and code blocks.
8//!
9//! The parser is built on top of [`pulldown_cmark`].
10//!
11//! ## Simple usage
12//!
13//! At the simplest level, you can parse a Markdown string by calling the [`from_str`] function:
14//!
15//! ```
16//! use basalt_core::markdown::{from_str, Node, HeadingLevel, Text};
17//!
18//! let markdown = "# My Heading\n\nSome text.";
19//! let nodes = from_str(markdown);
20//!
21//! assert_eq!(nodes, vec![
22//! Node::Heading {
23//! level: HeadingLevel::H1,
24//! text: Text::from("My Heading"),
25//! },
26//! Node::Paragraph {
27//! text: Text::from("Some text."),
28//! },
29//! ])
30//! ```
31//!
32//! ## Implementation details
33//!
34//! The [`Parser`] processes [`pulldown_cmark::Event`]s one by one, building up the current
35//! [`Node`] in `current_node`. When an event indicates the start of a new structure (e.g.,
36//! `Event::Start(Tag::Heading {..})`), the [`Parser`] pushes or replaces the current node
37//! with a new one. When an event indicates the end of that structure, the node is finalized
38//! and pushed into [`Parser::output`].
39//!
40//! Unrecognized events (such as [`InlineHtml`](pulldown_cmark::Event::InlineHtml)) are simply
41//! ignored for the time being.
42//!
43//! ## Not yet implemented
44//!
45//! - Handling of inline HTML, math blocks, etc.
46//! - Tracking code block language (`lang`) properly (currently set to [`None`]).
47use std::vec::IntoIter;
48
49use pulldown_cmark::{Event, Options, Tag, TagEnd};
50
/// Inline formatting that a [`TextNode`] may carry.
///
/// A [`TextNode`] holds at most one style (see [`TextNode::style`]).
#[derive(Debug, Clone, PartialEq)]
pub enum Style {
    /// Text written as an inline code span, e.g. `` `code` ``.
    Code,
    /// Emphasized (italic) text, e.g. `*emphasis*`.
    Emphasis,
    /// Struck-through text, e.g. `~~strikethrough~~`.
    Strikethrough,
    /// Strong (bold) text, e.g. `**strong**`.
    Strong,
}
63
/// The flavour of a list item: a task checkbox state or a plain list marker.
#[derive(Debug, Clone, PartialEq)]
pub enum ItemKind {
    /// A task item explicitly marked as done with `- [x]`.
    HardChecked,
    /// A task item that counts as checked without being the explicit `- [x]` form
    /// (e.g., `- [?]`).
    Checked,
    /// A task item that is still open, written as `- [ ]`.
    Unchecked,
    // TODO: Remove in favor of using List node that has children of nodes
    /// An item of an ordered list (e.g., `1. item`) together with its numeric index.
    Ordered(u64),
    /// An item of an unordered list (e.g., `- item`).
    Unordered,
}
80
/// Heading depth, from `H1` (top level) down to `H6`.
///
/// The discriminants equal the numeric level, so `HeadingLevel::H3 as u8 == 3`.
#[derive(Debug, Clone, PartialEq)]
#[allow(missing_docs)]
pub enum HeadingLevel {
    H1 = 1,
    H2 = 2,
    H3 = 3,
    H4 = 4,
    H5 = 5,
    H6 = 6,
}
91
92impl From<pulldown_cmark::HeadingLevel> for HeadingLevel {
93 fn from(value: pulldown_cmark::HeadingLevel) -> Self {
94 match value {
95 pulldown_cmark::HeadingLevel::H1 => HeadingLevel::H1,
96 pulldown_cmark::HeadingLevel::H2 => HeadingLevel::H2,
97 pulldown_cmark::HeadingLevel::H3 => HeadingLevel::H3,
98 pulldown_cmark::HeadingLevel::H4 => HeadingLevel::H4,
99 pulldown_cmark::HeadingLevel::H5 => HeadingLevel::H5,
100 pulldown_cmark::HeadingLevel::H6 => HeadingLevel::H6,
101 }
102 }
103}
104
/// The callout flavour of a block quote (note, tip, warning, and so on).
///
/// These mirror the kinds reported by [`pulldown_cmark`], which recognises them from
/// syntax such as `> [!NOTE] Some note`.
#[derive(Debug, Clone, PartialEq)]
#[allow(missing_docs)]
pub enum BlockQuoteKind {
    Note,
    Tip,
    Important,
    Warning,
    Caution,
}
118
119impl From<pulldown_cmark::BlockQuoteKind> for BlockQuoteKind {
120 fn from(value: pulldown_cmark::BlockQuoteKind) -> Self {
121 match value {
122 pulldown_cmark::BlockQuoteKind::Tip => BlockQuoteKind::Tip,
123 pulldown_cmark::BlockQuoteKind::Note => BlockQuoteKind::Note,
124 pulldown_cmark::BlockQuoteKind::Warning => BlockQuoteKind::Warning,
125 pulldown_cmark::BlockQuoteKind::Caution => BlockQuoteKind::Caution,
126 pulldown_cmark::BlockQuoteKind::Important => BlockQuoteKind::Important,
127 }
128 }
129}
130
/// Distinguishes ordered lists from unordered ones.
#[derive(Debug, Clone, PartialEq)]
pub enum ListKind {
    /// An ordered list entry (e.g., `1. item`) together with its numeric index.
    Ordered(u64),
    /// An unordered list entry (e.g., `- item`).
    Unordered,
}
139
140/// A single unit of text that is optionally styled (e.g., code).
141///
142/// [`TextNode`] can be any combination of sentence, words or characters.
143///
144/// Usually styled text will be contained in a single [`TextNode`] with the given [`Style`]
145/// property.
146#[derive(Clone, Debug, PartialEq, Default)]
147pub struct TextNode {
148 /// The literal text content.
149 pub content: String,
150 /// Optional inline style of the text.
151 pub style: Option<Style>,
152}
153
154impl TextNode {
155 /// Creates a new [`TextNode`] from `content` and optional [`Style`].
156 pub fn new(content: String, style: Option<Style>) -> Self {
157 Self { content, style }
158 }
159}
160
161/// A wrapper type holding a list of [`TextNode`]s.
162#[derive(Clone, Debug, PartialEq, Default)]
163pub struct Text(Vec<TextNode>);
164
165impl From<&str> for Text {
166 fn from(value: &str) -> Self {
167 Self(vec![TextNode::new(String::from(value), None)])
168 }
169}
170
171impl IntoIterator for Text {
172 type Item = TextNode;
173 type IntoIter = IntoIter<Self::Item>;
174 fn into_iter(self) -> Self::IntoIter {
175 self.0.into_iter()
176 }
177}
178
179impl Text {
180 /// Appends a [`TextNode`] to the inner text list.
181 fn push(&mut self, node: TextNode) {
182 self.0.push(node);
183 }
184}
185
186/// The Markdown AST node enumeration.
187#[derive(Clone, Debug, PartialEq)]
188#[allow(missing_docs)]
189pub enum Node {
190 /// A heading node that represents different heading levels.
191 ///
192 /// The level is controlled with the [`HeadingLevel`] definition.
193 Heading {
194 level: HeadingLevel,
195 text: Text,
196 },
197 Paragraph {
198 text: Text,
199 },
200 /// A block quote node that represents different quote block variants including callout blocks.
201 ///
202 /// The variant is controlled with the [`BlockQuoteKind`] definition. When [`BlockQuoteKind`]
203 /// is [`None`] the block quote should be interpreted as a regular block quote:
204 /// `"> Block quote"`.
205 BlockQuote {
206 kind: Option<BlockQuoteKind>,
207 nodes: Vec<Node>,
208 },
209 /// A fenced code block, optionally with a language identifier.
210 CodeBlock {
211 lang: Option<String>,
212 text: Text,
213 },
214 /// A list item node that represents different list item variants including task items.
215 ///
216 /// The variant is controlled with the [`ItemKind`] definition. When [`ItemKind`] is [`None`]
217 /// the item should be interpreted as unordered list item: `"- Item"`.
218 Item {
219 kind: Option<ItemKind>,
220 text: Text,
221 },
222}
223
224impl Node {
225 /// Pushes a [`TextNode`] into this node, if it contains a text buffer.
226 ///
227 /// If the node is a [`BlockQuote`], the [`TextNode`] will be pushed into the last child
228 /// [`Node`], if any.
229 /// ```
230 pub(crate) fn push_text_node(&mut self, node: TextNode) {
231 match self {
232 Node::Paragraph { text, .. }
233 | Node::Heading { text, .. }
234 | Node::CodeBlock { text, .. }
235 | Node::Item { text, .. } => text.push(node),
236 Node::BlockQuote { nodes, .. } => {
237 if let Some(last_node) = nodes.last_mut() {
238 last_node.push_text_node(node);
239 }
240 }
241 }
242 }
243}
244
245/// Returns `true` if the [`Node`] should be closed upon encountering the given [`TagEnd`].
246fn matches_tag_end(node: &Node, tag_end: &TagEnd) -> bool {
247 match (node, tag_end) {
248 (Node::Paragraph { .. }, TagEnd::Paragraph)
249 | (Node::Heading { .. }, TagEnd::Heading(..))
250 | (Node::BlockQuote { .. }, TagEnd::BlockQuote(..))
251 | (Node::CodeBlock { .. }, TagEnd::CodeBlock)
252 | (Node::Item { .. }, TagEnd::Item) => true,
253 _ => false,
254 }
255}
256
257/// Parses the given Markdown input into a list of [`Node`]s.
258///
259/// This is a convenience function for constructing a [`Parser`] and calling [`Parser::parse`].
260///
261/// # Examples
262///
263/// ```
264/// use basalt_core::markdown::{from_str, Node, HeadingLevel, Text};
265///
266/// let markdown = "# My Heading\n\nSome text.";
267/// let nodes = from_str(markdown);
268///
269/// assert_eq!(nodes, vec![
270/// Node::Heading {
271/// level: HeadingLevel::H1,
272/// text: Text::from("My Heading"),
273/// },
274/// Node::Paragraph {
275/// text: Text::from("Some text."),
276/// },
277/// ])
278/// ```
279pub fn from_str<'a>(text: &'a str) -> Vec<Node> {
280 Parser::new(text).parse()
281}
282
283/// A parser that consumes [`pulldown_cmark::Event`]s and produces a [`Vec`] of [`Node`].
284///
285/// # Examples
286///
287/// ```
288/// use basalt_core::markdown::{Parser, Node, HeadingLevel, Text};
289///
290/// let markdown = "# My Heading\n\nSome text.";
291/// let parser = Parser::new(markdown);
292/// let nodes = parser.parse();
293///
294/// assert_eq!(nodes, vec![
295/// Node::Heading {
296/// level: HeadingLevel::H1,
297/// text: Text::from("My Heading"),
298/// },
299/// Node::Paragraph {
300/// text: Text::from("Some text."),
301/// },
302/// ])
303/// ```
304pub struct Parser<'a> {
305 /// Contains the completed AST [`Node`]s.
306 pub output: Vec<Node>,
307 inner: pulldown_cmark::TextMergeStream<'a, pulldown_cmark::Parser<'a>>,
308 current_node: Option<Node>,
309}
310
311impl<'a> Iterator for Parser<'a> {
312 type Item = Event<'a>;
313 fn next(&mut self) -> Option<Self::Item> {
314 self.inner.next()
315 }
316}
317
318impl<'a> Parser<'a> {
319 /// Creates a new [`Parser`] from a Markdown input string.
320 ///
321 /// The parser uses [`pulldown_cmark::Parser::new_ext`] with [`Options::all()`] and
322 /// [`pulldown_cmark::TextMergeStream`] internally.
323 pub fn new(text: &'a str) -> Self {
324 let parser = pulldown_cmark::TextMergeStream::new(pulldown_cmark::Parser::new_ext(
325 text,
326 Options::all(),
327 ));
328
329 Self {
330 inner: parser,
331 output: vec![],
332 current_node: None,
333 }
334 }
335
336 /// Pushes a [`Node`] as a child if the current node is a [`BlockQuote`], otherwise sets it as
337 /// the `current_node`.
338 fn push_node(&mut self, node: Node) {
339 if let Some(Node::BlockQuote { nodes, .. }) = &mut self.current_node {
340 nodes.push(node);
341 } else {
342 self.set_node(&node);
343 }
344 }
345
346 /// Pushes a [`TextNode`] into the `current_node` if it exists.
347 fn push_text_node(&mut self, node: TextNode) {
348 if let Some(ref mut current) = self.current_node {
349 current.push_text_node(node);
350 }
351 }
352
353 /// Sets (or replaces) the `current_node` with a new one, discarding any old node.
354 fn set_node(&mut self, block: &Node) {
355 self.current_node.replace(block.clone());
356 }
357
358 /// Handles the start of a [`Tag`]. Pushes the matching semantic node to be processed.
359 fn tag(&mut self, tag: Tag<'a>) {
360 match tag {
361 Tag::Paragraph => self.push_node(Node::Paragraph {
362 text: Text::default(),
363 }),
364 Tag::Heading { level, .. } => self.push_node(Node::Heading {
365 level: level.into(),
366 text: Text::default(),
367 }),
368 Tag::BlockQuote(kind) => self.push_node(Node::BlockQuote {
369 kind: kind.map(|kind| kind.into()),
370 nodes: vec![],
371 }),
372 Tag::CodeBlock(_) => self.push_node(Node::CodeBlock {
373 lang: None,
374 text: Text::default(),
375 }),
376 Tag::Item => self.push_node(Node::Item {
377 kind: None,
378 text: Text::default(),
379 }),
380 // For now everything below this comment are defined as paragraph nodes
381 Tag::HtmlBlock
382 | Tag::List(_)
383 | Tag::FootnoteDefinition(_)
384 | Tag::Table(_)
385 | Tag::TableHead
386 | Tag::TableRow
387 | Tag::TableCell
388 | Tag::Emphasis
389 | Tag::Strong
390 | Tag::Strikethrough
391 | Tag::Link { .. }
392 | Tag::Image { .. }
393 | Tag::MetadataBlock(_)
394 | Tag::DefinitionList
395 | Tag::DefinitionListTitle
396 | Tag::DefinitionListDefinition => self.push_node(Node::Paragraph {
397 text: Text::default(),
398 }),
399 }
400 }
401
402 /// Handles the end of a [`Tag`], finalizing a node if matching.
403 fn tag_end(&mut self, tag_end: TagEnd) {
404 let Some(node) = self.current_node.take() else {
405 return;
406 };
407
408 if matches_tag_end(&node, &tag_end) {
409 self.output.push(node);
410 } else {
411 self.set_node(&node);
412 }
413 }
414
415 /// Processes a single [`Event`] from the underlying [`pulldown_cmark::Parser`] iterator.
416 fn handle_event(&mut self, event: Event<'a>) {
417 match event {
418 Event::Start(tag) => self.tag(tag),
419 Event::End(tag_end) => self.tag_end(tag_end),
420 Event::Text(text) => self.push_text_node(TextNode::new(text.to_string(), None)),
421 Event::Code(text) => {
422 self.push_text_node(TextNode::new(text.to_string(), Some(Style::Code)))
423 }
424 Event::TaskListMarker(checked) => {
425 if checked {
426 self.set_node(&Node::Item {
427 kind: Some(ItemKind::HardChecked),
428 text: Text::default(),
429 });
430 } else {
431 self.set_node(&Node::Item {
432 kind: Some(ItemKind::Unchecked),
433 text: Text::default(),
434 });
435 }
436 }
437 Event::InlineMath(_)
438 | Event::DisplayMath(_)
439 | Event::Html(_)
440 | Event::InlineHtml(_)
441 | Event::SoftBreak
442 | Event::HardBreak
443 | Event::Rule
444 | Event::FootnoteReference(_) => {
445 // TODO: Not yet implemented
446 }
447 }
448 }
449
450 /// Consumes the parser, processing all remaining events from the stream into a list of
451 /// [`Node`]s.
452 ///
453 /// # Examples
454 ///
455 /// ```
456 /// # use basalt_core::markdown::{Parser, Node, Text};
457 /// let parser = Parser::new("Hello world");
458 ///
459 /// let nodes = parser.parse();
460 ///
461 /// assert_eq!(nodes, vec![Node::Paragraph { text: Text::from("Hello world") }]);
462 /// ```
463 pub fn parse(mut self) -> Vec<Node> {
464 while let Some(event) = self.next() {
465 self.handle_event(event);
466 }
467
468 if let Some(node) = self.current_node.take() {
469 self.output.push(node);
470 }
471
472 self.output
473 }
474}
475
#[cfg(test)]
mod tests {
    use indoc::indoc;

    use super::*;

    /// Builds a [`Text`] consisting of a single unstyled [`TextNode`].
    fn text(content: &str) -> Text {
        Text::from(content)
    }

    /// Builds a paragraph node with plain text.
    fn p(content: &str) -> Node {
        Node::Paragraph {
            text: text(content),
        }
    }

    /// Builds a regular (non-callout) block quote with the given children.
    fn blockquote(nodes: Vec<Node>) -> Node {
        Node::BlockQuote { kind: None, nodes }
    }

    /// Builds a list item node with the given kind and plain text.
    fn item_of(kind: Option<ItemKind>, content: &str) -> Node {
        Node::Item {
            kind,
            text: text(content),
        }
    }

    /// Builds a plain list item.
    fn item(content: &str) -> Node {
        item_of(None, content)
    }

    /// Builds an unchecked task item.
    fn task(content: &str) -> Node {
        item_of(Some(ItemKind::Unchecked), content)
    }

    /// Builds a task item checked with `[x]`.
    fn completed_task(content: &str) -> Node {
        item_of(Some(ItemKind::HardChecked), content)
    }

    /// Builds a heading node of the given level with plain text.
    fn heading(level: HeadingLevel, content: &str) -> Node {
        Node::Heading {
            level,
            text: text(content),
        }
    }

    fn h1(content: &str) -> Node {
        heading(HeadingLevel::H1, content)
    }

    fn h2(content: &str) -> Node {
        heading(HeadingLevel::H2, content)
    }

    fn h3(content: &str) -> Node {
        heading(HeadingLevel::H3, content)
    }

    fn h4(content: &str) -> Node {
        heading(HeadingLevel::H4, content)
    }

    fn h5(content: &str) -> Node {
        heading(HeadingLevel::H5, content)
    }

    fn h6(content: &str) -> Node {
        heading(HeadingLevel::H6, content)
    }

    #[test]
    fn test_parse() {
        // Each case pairs a Markdown input with the nodes it is expected to produce.
        let cases = [
            (
                indoc! {r#"# Heading 1

                ## Heading 2

                ### Heading 3

                #### Heading 4

                ##### Heading 5

                ###### Heading 6
                "#},
                vec![
                    h1("Heading 1"),
                    h2("Heading 2"),
                    h3("Heading 3"),
                    h4("Heading 4"),
                    h5("Heading 5"),
                    h6("Heading 6"),
                ],
            ),
            (
                indoc! {r#"Paragraph

                > BlockQuote
                >
                > - List item in BlockQuote
                "#},
                vec![
                    p("Paragraph"),
                    blockquote(vec![
                        p("BlockQuote"),
                        Node::Paragraph {
                            text: Text::default(),
                        },
                        item("List item in BlockQuote"),
                    ]),
                ],
            ),
            // TODO: Implement correct test case when `- [?] ` task item syntax is supported
            // Now we interpret it as a regular paragraph
            (
                indoc! { r#"## Tasks

                - [ ] Task

                - [x] Completed task

                - [?] Completed task
                "#},
                vec![
                    h2("Tasks"),
                    task("Task"),
                    completed_task("Completed task"),
                    p("[?] Completed task"),
                ],
            ),
        ];

        for (input, expected) in cases.iter() {
            assert_eq!(from_str(input), *expected);
        }
    }
}
613}