basalt_core/markdown.rs
1//! A Markdown parser that transforms Markdown input into a custom abstract syntax tree (AST)
//! intended to be rendered with [basalt](https://github.com/erikjuhani/basalt)—a TUI application
3//! for Obsidian.
4//!
5//! This module provides a [`Parser`] type, which processes raw Markdown input into a [`Vec`] of
6//! [`Node`]s. These [`Node`]s represent semantic elements such as headings, paragraphs, block
7//! quotes, and code blocks.
8//!
9//! The parser is built on top of [`pulldown_cmark`].
10//!
11//! ## Simple usage
12//!
13//! At the simplest level, you can parse a Markdown string by calling the [`from_str`] function:
14//!
15//! ```
16//! use basalt_core::markdown::{from_str, Node, HeadingLevel, Text};
17//!
18//! let markdown = "# My Heading\n\nSome text.";
19//! let nodes = from_str(markdown);
20//!
21//! assert_eq!(nodes, vec![
22//! Node::Heading {
23//! level: HeadingLevel::H1,
24//! text: Text::from("My Heading"),
25//! },
26//! Node::Paragraph {
27//! text: Text::from("Some text."),
28//! },
29//! ])
30//! ```
31//!
32//! ## Implementation details
33//!
34//! The [`Parser`] processes [`pulldown_cmark::Event`]s one by one, building up the current
35//! [`Node`] in `current_node`. When an event indicates the start of a new structure (e.g.,
36//! `Event::Start(Tag::Heading {..})`), the [`Parser`] pushes or replaces the current node
37//! with a new one. When an event indicates the end of that structure, the node is finalized
38//! and pushed into [`Parser::output`].
39//!
40//! Unrecognized events (such as [`InlineHtml`](pulldown_cmark::Event::InlineHtml)) are simply
41//! ignored for the time being.
42//!
43//! ## Not yet implemented
44//!
45//! - Handling of inline HTML, math blocks, etc.
46//! - Tracking code block language (`lang`) properly (currently set to [`None`]).
47use std::vec::IntoIter;
48
49use pulldown_cmark::{Event, Options, Tag, TagEnd};
50
/// An inline style that can be applied to a [`TextNode`] (code, emphasis, strikethrough, strong).
///
/// A [`TextNode`] carries at most one style through its optional `style` field.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum Style {
    /// Inline code style (e.g. `code`).
    Code,
    /// Italic/emphasis style (e.g. `*emphasis*`).
    Emphasis,
    /// Strikethrough style (e.g. `~~strikethrough~~`).
    Strikethrough,
    /// Bold/strong style (e.g. `**strong**`).
    Strong,
}
63
/// Represents the variant of a list or task item (checked, unchecked, etc.).
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ItemKind {
    /// A checkbox item that is marked as done using `- [x]`.
    HardChecked,
    /// A checkbox item that is checked, but not explicitly recognized as
    /// `HardChecked` (e.g., `- [?]`).
    Checked,
    /// A checkbox item that is unchecked using `- [ ]`.
    Unchecked,
    // TODO: Remove in favor of using List node that has children of nodes
    /// An ordered list item (e.g., `1. item`), storing the numeric index.
    Ordered(u64),
    /// An unordered list item (e.g., `- item`).
    Unordered,
}
80
/// The level of a Markdown heading, from [`HeadingLevel::H1`] to [`HeadingLevel::H6`].
///
/// Discriminants start at 1, so casting a variant with `as` yields its numeric level.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum HeadingLevel {
    /// A level 1 heading (e.g. `# Heading`).
    H1 = 1,
    /// A level 2 heading (e.g. `## Heading`).
    H2,
    /// A level 3 heading (e.g. `### Heading`).
    H3,
    /// A level 4 heading (e.g. `#### Heading`).
    H4,
    /// A level 5 heading (e.g. `##### Heading`).
    H5,
    /// A level 6 heading (e.g. `###### Heading`).
    H6,
}
91
92impl From<pulldown_cmark::HeadingLevel> for HeadingLevel {
93 fn from(value: pulldown_cmark::HeadingLevel) -> Self {
94 match value {
95 pulldown_cmark::HeadingLevel::H1 => HeadingLevel::H1,
96 pulldown_cmark::HeadingLevel::H2 => HeadingLevel::H2,
97 pulldown_cmark::HeadingLevel::H3 => HeadingLevel::H3,
98 pulldown_cmark::HeadingLevel::H4 => HeadingLevel::H4,
99 pulldown_cmark::HeadingLevel::H5 => HeadingLevel::H5,
100 pulldown_cmark::HeadingLevel::H6 => HeadingLevel::H6,
101 }
102 }
103}
104
/// Represents specialized block quote kind variants (tip, note, warning, etc.).
///
/// Currently, the underlying [`pulldown_cmark`] parser distinguishes these via callout syntax
/// such as `> [!NOTE] Some note`.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum BlockQuoteKind {
    /// A note callout (e.g. `> [!NOTE]`).
    Note,
    /// A tip callout (e.g. `> [!TIP]`).
    Tip,
    /// An important callout (e.g. `> [!IMPORTANT]`).
    Important,
    /// A warning callout (e.g. `> [!WARNING]`).
    Warning,
    /// A caution callout (e.g. `> [!CAUTION]`).
    Caution,
}
118
119impl From<pulldown_cmark::BlockQuoteKind> for BlockQuoteKind {
120 fn from(value: pulldown_cmark::BlockQuoteKind) -> Self {
121 match value {
122 pulldown_cmark::BlockQuoteKind::Tip => BlockQuoteKind::Tip,
123 pulldown_cmark::BlockQuoteKind::Note => BlockQuoteKind::Note,
124 pulldown_cmark::BlockQuoteKind::Warning => BlockQuoteKind::Warning,
125 pulldown_cmark::BlockQuoteKind::Caution => BlockQuoteKind::Caution,
126 pulldown_cmark::BlockQuoteKind::Important => BlockQuoteKind::Important,
127 }
128 }
129}
130
/// Denotes whether a list is ordered or unordered.
#[derive(Clone, Debug, PartialEq, Eq)]
pub enum ListKind {
    /// An ordered list item (e.g., `1. item`), storing the numeric index.
    Ordered(u64),
    /// An unordered list item (e.g., `- item`).
    Unordered,
}
139
/// A single unit of text that is optionally styled (e.g., code).
///
/// [`TextNode`] can be any combination of sentence, words or characters.
///
/// Usually styled text will be contained in a single [`TextNode`] with the given [`Style`]
/// property.
///
/// The derived [`Default`] yields an empty `content` and no `style`.
#[derive(Clone, Debug, PartialEq, Default)]
pub struct TextNode {
    /// The literal text content.
    pub content: String,
    /// Optional inline style of the text; `None` means the text is unstyled.
    pub style: Option<Style>,
}
153
154impl From<&str> for TextNode {
155 fn from(value: &str) -> Self {
156 value.to_string().into()
157 }
158}
159
160impl From<String> for TextNode {
161 fn from(value: String) -> Self {
162 Self {
163 content: value,
164 ..Default::default()
165 }
166 }
167}
168
169impl TextNode {
170 /// Creates a new [`TextNode`] from `content` and optional [`Style`].
171 pub fn new(content: String, style: Option<Style>) -> Self {
172 Self { content, style }
173 }
174}
175
/// A wrapper type holding a list of [`TextNode`]s.
///
/// The inner vector is private; a [`Text`] is built through its `From` conversions and
/// consumed through [`IntoIterator`]. The derived [`Default`] is an empty list.
#[derive(Clone, Debug, PartialEq, Default)]
pub struct Text(Vec<TextNode>);
179
180impl From<&str> for Text {
181 fn from(value: &str) -> Self {
182 TextNode::from(value).into()
183 }
184}
185
186impl From<String> for Text {
187 fn from(value: String) -> Self {
188 TextNode::from(value).into()
189 }
190}
191
192impl From<TextNode> for Text {
193 fn from(value: TextNode) -> Self {
194 Self([value].to_vec())
195 }
196}
197
198impl From<Vec<TextNode>> for Text {
199 fn from(value: Vec<TextNode>) -> Self {
200 Self(value)
201 }
202}
203
204impl From<&[TextNode]> for Text {
205 fn from(value: &[TextNode]) -> Self {
206 Self(value.to_vec())
207 }
208}
209
210impl IntoIterator for Text {
211 type Item = TextNode;
212 type IntoIter = IntoIter<Self::Item>;
213 fn into_iter(self) -> Self::IntoIter {
214 self.0.into_iter()
215 }
216}
217
218impl Text {
219 /// Appends a [`TextNode`] to the inner text list.
220 fn push(&mut self, node: TextNode) {
221 self.0.push(node);
222 }
223}
224
/// The Markdown AST node enumeration.
///
/// Every variant except [`Node::BlockQuote`] stores its content as a flat [`Text`];
/// [`Node::BlockQuote`] stores child [`Node`]s instead.
#[derive(Clone, Debug, PartialEq)]
#[allow(missing_docs)]
pub enum Node {
    /// A heading node that represents different heading levels.
    ///
    /// The level is controlled with the [`HeadingLevel`] definition.
    Heading {
        /// The heading level (`H1` through `H6`).
        level: HeadingLevel,
        /// The heading's text content.
        text: Text,
    },
    /// A paragraph of text.
    Paragraph {
        /// The paragraph's text content.
        text: Text,
    },
    /// A block quote node that represents different quote block variants including callout blocks.
    ///
    /// The variant is controlled with the [`BlockQuoteKind`] definition. When [`BlockQuoteKind`]
    /// is [`None`] the block quote should be interpreted as a regular block quote:
    /// `"> Block quote"`.
    BlockQuote {
        /// The callout kind, or [`None`] for a regular block quote.
        kind: Option<BlockQuoteKind>,
        /// The nodes contained in the block quote, in document order.
        nodes: Vec<Node>,
    },
    /// A fenced code block, optionally with a language identifier.
    CodeBlock {
        /// The language identifier, if any.
        lang: Option<String>,
        /// The code block's text content.
        text: Text,
    },
    /// A list item node that represents different list item variants including task items.
    ///
    /// The variant is controlled with the [`ItemKind`] definition. When [`ItemKind`] is [`None`]
    /// the item should be interpreted as unordered list item: `"- Item"`.
    Item {
        /// The item kind, or [`None`] for a plain unordered item.
        kind: Option<ItemKind>,
        /// The item's text content.
        text: Text,
    },
}
262
263impl Node {
264 /// Pushes a [`TextNode`] into this node, if it contains a text buffer.
265 ///
266 /// If the node is a [`BlockQuote`], the [`TextNode`] will be pushed into the last child
267 /// [`Node`], if any.
268 /// ```
269 pub(crate) fn push_text_node(&mut self, node: TextNode) {
270 match self {
271 Node::Paragraph { text, .. }
272 | Node::Heading { text, .. }
273 | Node::CodeBlock { text, .. }
274 | Node::Item { text, .. } => text.push(node),
275 Node::BlockQuote { nodes, .. } => {
276 if let Some(last_node) = nodes.last_mut() {
277 last_node.push_text_node(node);
278 }
279 }
280 }
281 }
282}
283
284/// Returns `true` if the [`Node`] should be closed upon encountering the given [`TagEnd`].
285fn matches_tag_end(node: &Node, tag_end: &TagEnd) -> bool {
286 match (node, tag_end) {
287 (Node::Paragraph { .. }, TagEnd::Paragraph)
288 | (Node::Heading { .. }, TagEnd::Heading(..))
289 | (Node::BlockQuote { .. }, TagEnd::BlockQuote(..))
290 | (Node::CodeBlock { .. }, TagEnd::CodeBlock)
291 | (Node::Item { .. }, TagEnd::Item) => true,
292 _ => false,
293 }
294}
295
296/// Parses the given Markdown input into a list of [`Node`]s.
297///
298/// This is a convenience function for constructing a [`Parser`] and calling [`Parser::parse`].
299///
300/// # Examples
301///
302/// ```
303/// use basalt_core::markdown::{from_str, Node, HeadingLevel, Text};
304///
305/// let markdown = "# My Heading\n\nSome text.";
306/// let nodes = from_str(markdown);
307///
308/// assert_eq!(nodes, vec![
309/// Node::Heading {
310/// level: HeadingLevel::H1,
311/// text: Text::from("My Heading"),
312/// },
313/// Node::Paragraph {
314/// text: Text::from("Some text."),
315/// },
316/// ])
317/// ```
318pub fn from_str<'a>(text: &'a str) -> Vec<Node> {
319 Parser::new(text).parse()
320}
321
/// A parser that consumes [`pulldown_cmark::Event`]s and produces a [`Vec`] of [`Node`].
///
/// # Examples
///
/// ```
/// use basalt_core::markdown::{Parser, Node, HeadingLevel, Text};
///
/// let markdown = "# My Heading\n\nSome text.";
/// let parser = Parser::new(markdown);
/// let nodes = parser.parse();
///
/// assert_eq!(nodes, vec![
///     Node::Heading {
///         level: HeadingLevel::H1,
///         text: Text::from("My Heading"),
///     },
///     Node::Paragraph {
///         text: Text::from("Some text."),
///     },
/// ])
/// ```
pub struct Parser<'a> {
    /// Contains the completed AST [`Node`]s.
    pub output: Vec<Node>,
    /// The underlying event stream, wrapped in a text-merging adapter.
    inner: pulldown_cmark::TextMergeStream<'a, pulldown_cmark::Parser<'a>>,
    /// The node currently being built, if any; moved into `output` once it is finalized.
    current_node: Option<Node>,
}
349
350impl<'a> Iterator for Parser<'a> {
351 type Item = Event<'a>;
352 fn next(&mut self) -> Option<Self::Item> {
353 self.inner.next()
354 }
355}
356
357impl<'a> Parser<'a> {
358 /// Creates a new [`Parser`] from a Markdown input string.
359 ///
360 /// The parser uses [`pulldown_cmark::Parser::new_ext`] with [`Options::all()`] and
361 /// [`pulldown_cmark::TextMergeStream`] internally.
362 pub fn new(text: &'a str) -> Self {
363 let parser = pulldown_cmark::TextMergeStream::new(pulldown_cmark::Parser::new_ext(
364 text,
365 Options::all(),
366 ));
367
368 Self {
369 inner: parser,
370 output: vec![],
371 current_node: None,
372 }
373 }
374
375 /// Pushes a [`Node`] as a child if the current node is a [`BlockQuote`], otherwise sets it as
376 /// the `current_node`.
377 fn push_node(&mut self, node: Node) {
378 if let Some(Node::BlockQuote { nodes, .. }) = &mut self.current_node {
379 nodes.push(node);
380 } else {
381 self.set_node(&node);
382 }
383 }
384
385 /// Pushes a [`TextNode`] into the `current_node` if it exists.
386 fn push_text_node(&mut self, node: TextNode) {
387 if let Some(ref mut current) = self.current_node {
388 current.push_text_node(node);
389 }
390 }
391
392 /// Sets (or replaces) the `current_node` with a new one, discarding any old node.
393 fn set_node(&mut self, block: &Node) {
394 self.current_node.replace(block.clone());
395 }
396
397 /// Handles the start of a [`Tag`]. Pushes the matching semantic node to be processed.
398 fn tag(&mut self, tag: Tag<'a>) {
399 match tag {
400 Tag::Paragraph => self.push_node(Node::Paragraph {
401 text: Text::default(),
402 }),
403 Tag::Heading { level, .. } => self.push_node(Node::Heading {
404 level: level.into(),
405 text: Text::default(),
406 }),
407 Tag::BlockQuote(kind) => self.push_node(Node::BlockQuote {
408 kind: kind.map(|kind| kind.into()),
409 nodes: vec![],
410 }),
411 Tag::CodeBlock(_) => self.push_node(Node::CodeBlock {
412 lang: None,
413 text: Text::default(),
414 }),
415 Tag::Item => self.push_node(Node::Item {
416 kind: None,
417 text: Text::default(),
418 }),
419 // For now everything below this comment are defined as paragraph nodes
420 Tag::HtmlBlock
421 | Tag::List(_)
422 | Tag::FootnoteDefinition(_)
423 | Tag::Table(_)
424 | Tag::TableHead
425 | Tag::TableRow
426 | Tag::TableCell
427 | Tag::Emphasis
428 | Tag::Strong
429 | Tag::Strikethrough
430 | Tag::Link { .. }
431 | Tag::Image { .. }
432 | Tag::MetadataBlock(_)
433 | Tag::DefinitionList
434 | Tag::DefinitionListTitle
435 | Tag::DefinitionListDefinition => {}
436 }
437 }
438
439 /// Handles the end of a [`Tag`], finalizing a node if matching.
440 fn tag_end(&mut self, tag_end: TagEnd) {
441 let Some(node) = self.current_node.take() else {
442 return;
443 };
444
445 if matches_tag_end(&node, &tag_end) {
446 self.output.push(node);
447 } else {
448 self.set_node(&node);
449 }
450 }
451
452 /// Processes a single [`Event`] from the underlying [`pulldown_cmark::Parser`] iterator.
453 fn handle_event(&mut self, event: Event<'a>) {
454 match event {
455 Event::Start(tag) => self.tag(tag),
456 Event::End(tag_end) => self.tag_end(tag_end),
457 Event::Text(text) => self.push_text_node(TextNode::new(text.to_string(), None)),
458 Event::Code(text) => {
459 self.push_text_node(TextNode::new(text.to_string(), Some(Style::Code)))
460 }
461 Event::TaskListMarker(checked) => {
462 if checked {
463 self.set_node(&Node::Item {
464 kind: Some(ItemKind::HardChecked),
465 text: Text::default(),
466 });
467 } else {
468 self.set_node(&Node::Item {
469 kind: Some(ItemKind::Unchecked),
470 text: Text::default(),
471 });
472 }
473 }
474 Event::InlineMath(_)
475 | Event::DisplayMath(_)
476 | Event::Html(_)
477 | Event::InlineHtml(_)
478 | Event::SoftBreak
479 | Event::HardBreak
480 | Event::Rule
481 | Event::FootnoteReference(_) => {
482 // TODO: Not yet implemented
483 }
484 }
485 }
486
487 /// Consumes the parser, processing all remaining events from the stream into a list of
488 /// [`Node`]s.
489 ///
490 /// # Examples
491 ///
492 /// ```
493 /// # use basalt_core::markdown::{Parser, Node, Text};
494 /// let parser = Parser::new("Hello world");
495 ///
496 /// let nodes = parser.parse();
497 ///
498 /// assert_eq!(nodes, vec![Node::Paragraph { text: Text::from("Hello world") }]);
499 /// ```
500 pub fn parse(mut self) -> Vec<Node> {
501 while let Some(event) = self.next() {
502 self.handle_event(event);
503 }
504
505 if let Some(node) = self.current_node.take() {
506 self.output.push(node);
507 }
508
509 self.output
510 }
511}
512
#[cfg(test)]
mod tests {
    use indoc::indoc;

    /// Builds a paragraph node holding `str` as a single unstyled text node.
    fn p(str: &str) -> Node {
        Node::Paragraph { text: str.into() }
    }

    /// Builds a regular (kind-less) block quote with the given children.
    fn blockquote(nodes: Vec<Node>) -> Node {
        Node::BlockQuote { kind: None, nodes }
    }

    /// Builds a plain list item (no kind) holding `str`.
    fn item(str: &str) -> Node {
        Node::Item {
            kind: None,
            text: str.into(),
        }
    }

    /// Builds an unchecked task item holding `str`.
    fn task(str: &str) -> Node {
        Node::Item {
            kind: Some(ItemKind::Unchecked),
            text: str.into(),
        }
    }

    /// Builds a checked (`[x]`) task item holding `str`.
    fn completed_task(str: &str) -> Node {
        Node::Item {
            kind: Some(ItemKind::HardChecked),
            text: str.into(),
        }
    }

    /// Builds a heading node of the given level holding `str`.
    fn heading(level: HeadingLevel, str: &str) -> Node {
        Node::Heading {
            level,
            text: str.into(),
        }
    }

    /// Shorthand for a level 1 heading.
    fn h1(str: &str) -> Node {
        heading(HeadingLevel::H1, str)
    }

    /// Shorthand for a level 2 heading.
    fn h2(str: &str) -> Node {
        heading(HeadingLevel::H2, str)
    }

    /// Shorthand for a level 3 heading.
    fn h3(str: &str) -> Node {
        heading(HeadingLevel::H3, str)
    }

    /// Shorthand for a level 4 heading.
    fn h4(str: &str) -> Node {
        heading(HeadingLevel::H4, str)
    }

    /// Shorthand for a level 5 heading.
    fn h5(str: &str) -> Node {
        heading(HeadingLevel::H5, str)
    }

    /// Shorthand for a level 6 heading.
    fn h6(str: &str) -> Node {
        heading(HeadingLevel::H6, str)
    }

    use super::*;

    /// Table-driven test: each entry pairs a Markdown input with the exact node list that
    /// `from_str` is expected to produce for it.
    #[test]
    fn test_parse() {
        let tests = [
            // All six heading levels.
            (
                indoc! {r#"# Heading 1

                ## Heading 2

                ### Heading 3

                #### Heading 4

                ##### Heading 5

                ###### Heading 6
                "#},
                vec![
                    h1("Heading 1"),
                    h2("Heading 2"),
                    h3("Heading 3"),
                    h4("Heading 4"),
                    h5("Heading 5"),
                    h6("Heading 6"),
                ],
            ),
            // TODO: Implement correct test case when `- [?] ` task item syntax is supported
            // Now we interpret it as a regular paragraph
            (
                indoc! { r#"## Tasks

                - [ ] Task

                - [x] Completed task

                - [?] Completed task
                "#},
                vec![
                    h2("Tasks"),
                    task("Task"),
                    completed_task("Completed task"),
                    p("[?] Completed task"),
                ],
            ),
            // Inline code keeps its style; the emphasized word is expected as unstyled text.
            (
                indoc! {r#"## Quotes

                You _can_ quote text by adding a `>` symbols before the text.

                > Human beings face ever more complex and urgent problems, and their effectiveness in dealing with these problems is a matter that is critical to the stability and continued progress of society.
                >
                >- Doug Engelbart, 1961
                "#},
                vec![
                    h2("Quotes"),
                    Node::Paragraph {
                        text: vec![
                            TextNode::new("You ".into(), None),
                            TextNode::new("can".into(), None),
                            TextNode::new(" quote text by adding a ".into(), None),
                            TextNode::new(">".into(), Some(Style::Code)),
                            TextNode::new(" symbols before the text.".into(), None),
                        ]
                        .into(),
                    },
                    blockquote(vec![
                        p("Human beings face ever more complex and urgent problems, and their effectiveness in dealing with these problems is a matter that is critical to the stability and continued progress of society."),
                        item("Doug Engelbart, 1961")
                    ]),
                ],
            ),
        ];

        tests
            .iter()
            .for_each(|test| assert_eq!(from_str(test.0), test.1));
    }
}
655}