Skip to main content

scrybe_core/
ast.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright 2026 Shawn Hartsock and contributors
3
4//! Markdown AST type definitions for Scrybe.
5//!
6//! The [`Ast`] type is built from pulldown-cmark events and represents
7//! the document structure as a tree of [`Node`]s. Rendering lives in
8//! `scrybe-render` (P1.3).
9
10use pulldown_cmark::{Event, HeadingLevel, Options, Parser, Tag, TagEnd};
11
/// One element of the Markdown document tree.
///
/// Container variants own their descendants in a `Vec<Node>`; leaf variants
/// carry their payload directly (or nothing at all).
#[derive(Debug, Clone, PartialEq)]
pub enum Node {
    /// A heading: `level` is the numeric depth (1–6), `children` the inline
    /// content of the heading line.
    Heading {
        level: u8,
        children: Vec<Node>,
    },
    /// A paragraph, i.e. a run of inline content.
    Paragraph {
        children: Vec<Node>,
    },
    /// A code block. `lang` is the language tag and is empty when none was
    /// given; `content` is the code text.
    FencedCode {
        lang: String,
        content: String,
    },
    /// An inline code span.
    InlineCode {
        content: String,
    },
    /// A block quote wrapping block-level children.
    BlockQuote {
        children: Vec<Node>,
    },
    /// A list; `ordered` distinguishes numbered lists from bulleted ones.
    List {
        ordered: bool,
        items: Vec<Node>,
    },
    /// One entry of a list, wrapping its own children.
    ListItem {
        children: Vec<Node>,
    },
    /// Emphasised (italic) inline content.
    Emphasis {
        children: Vec<Node>,
    },
    /// Strong (bold) inline content.
    Strong {
        children: Vec<Node>,
    },
    /// A hyperlink with its destination, title and link text.
    Link {
        href: String,
        title: String,
        children: Vec<Node>,
    },
    /// An image with its source URL and alt string.
    Image {
        src: String,
        alt: String,
    },
    /// A thematic break (`---` / `***`).
    HorizontalRule,
    /// A hard (forced) line break.
    HardBreak,
    /// A soft line break (single newline in source).
    SoftBreak,
    /// A run of plain text.
    Text(String),
    /// Raw HTML, passed through verbatim.
    Html(String),
}
52
53// ---------------------------------------------------------------------------
54// Stack frame used while building the AST
55// ---------------------------------------------------------------------------
56
/// Marker for a container that is currently open while the tree is built.
///
/// Each frame records which kind of node is under construction together with
/// the attributes that were known when the container was opened; the
/// children gathered so far are kept next to the frame on the builder stack.
#[derive(Debug)]
enum Frame {
    /// Bottom of the stack; gathers the document's top-level nodes.
    Root,
    /// An open heading and its numeric level.
    Heading {
        level: u8,
    },
    /// An open paragraph.
    Paragraph,
    /// An open block quote.
    BlockQuote,
    /// An open list; `ordered` is fixed when the list starts.
    List {
        ordered: bool,
    },
    /// An open list item.
    ListItem,
    /// An open emphasis span.
    Emphasis,
    /// An open strong span.
    Strong,
    /// An open link with its destination and title.
    Link {
        href: String,
        title: String,
    },
    /// An open image with its source and the alt string recorded at open time.
    Image {
        src: String,
        alt: String,
    },
    /// An open code block and its language tag.
    FencedCode {
        lang: String,
    },
}
71
72/// A parsed Markdown document as a tree of [`Node`]s.
73#[derive(Debug, Clone, PartialEq)]
74pub struct Ast {
75    /// Top-level nodes of the document.
76    pub nodes: Vec<Node>,
77}
78
79impl Ast {
80    /// Parses Markdown *source* into an AST.
81    pub fn parse(source: &str) -> Self {
82        let opts =
83            Options::ENABLE_STRIKETHROUGH | Options::ENABLE_TABLES | Options::ENABLE_FOOTNOTES;
84        let parser = Parser::new_ext(source, opts);
85
86        // Each stack entry is (frame, children-accumulated-so-far).
87        let mut stack: Vec<(Frame, Vec<Node>)> = vec![(Frame::Root, Vec::new())];
88
89        // Accumulates text inside a FencedCode block before we close it.
90        let mut code_buf = String::new();
91
92        for event in parser {
93            match event {
94                // --- Opening tags ---
95                Event::Start(tag) => match tag {
96                    Tag::Heading { level, .. } => {
97                        stack.push((
98                            Frame::Heading {
99                                level: heading_level(level),
100                            },
101                            Vec::new(),
102                        ));
103                    }
104                    Tag::Paragraph => {
105                        stack.push((Frame::Paragraph, Vec::new()));
106                    }
107                    Tag::BlockQuote(_) => {
108                        stack.push((Frame::BlockQuote, Vec::new()));
109                    }
110                    Tag::List(start) => {
111                        stack.push((
112                            Frame::List {
113                                ordered: start.is_some(),
114                            },
115                            Vec::new(),
116                        ));
117                    }
118                    Tag::Item => {
119                        stack.push((Frame::ListItem, Vec::new()));
120                    }
121                    Tag::Emphasis => {
122                        stack.push((Frame::Emphasis, Vec::new()));
123                    }
124                    Tag::Strong => {
125                        stack.push((Frame::Strong, Vec::new()));
126                    }
127                    Tag::Link {
128                        dest_url, title, ..
129                    } => {
130                        stack.push((
131                            Frame::Link {
132                                href: dest_url.to_string(),
133                                title: title.to_string(),
134                            },
135                            Vec::new(),
136                        ));
137                    }
138                    Tag::Image {
139                        dest_url, title, ..
140                    } => {
141                        // pulldown-cmark puts alt text as Text events before End(Image).
142                        // We collect them as the alt string.
143                        stack.push((
144                            Frame::Image {
145                                src: dest_url.to_string(),
146                                alt: title.to_string(),
147                            },
148                            Vec::new(),
149                        ));
150                    }
151                    Tag::CodeBlock(kind) => {
152                        let lang = match kind {
153                            pulldown_cmark::CodeBlockKind::Fenced(s) => s.to_string(),
154                            pulldown_cmark::CodeBlockKind::Indented => String::new(),
155                        };
156                        code_buf.clear();
157                        stack.push((Frame::FencedCode { lang }, Vec::new()));
158                    }
159                    // Ignore tags we don't model (tables, footnotes, etc.)
160                    _ => {}
161                },
162
163                // --- Closing tags ---
164                Event::End(tag_end) => {
165                    let node = match tag_end {
166                        TagEnd::Heading(_) => {
167                            if let Some((Frame::Heading { level }, children)) = stack.pop() {
168                                Some(Node::Heading { level, children })
169                            } else {
170                                None
171                            }
172                        }
173                        TagEnd::Paragraph => {
174                            if let Some((Frame::Paragraph, children)) = stack.pop() {
175                                Some(Node::Paragraph { children })
176                            } else {
177                                None
178                            }
179                        }
180                        TagEnd::BlockQuote(_) => {
181                            if let Some((Frame::BlockQuote, children)) = stack.pop() {
182                                Some(Node::BlockQuote { children })
183                            } else {
184                                None
185                            }
186                        }
187                        TagEnd::List(_) => {
188                            if let Some((Frame::List { ordered }, items)) = stack.pop() {
189                                Some(Node::List { ordered, items })
190                            } else {
191                                None
192                            }
193                        }
194                        TagEnd::Item => {
195                            if let Some((Frame::ListItem, children)) = stack.pop() {
196                                Some(Node::ListItem { children })
197                            } else {
198                                None
199                            }
200                        }
201                        TagEnd::Emphasis => {
202                            if let Some((Frame::Emphasis, children)) = stack.pop() {
203                                Some(Node::Emphasis { children })
204                            } else {
205                                None
206                            }
207                        }
208                        TagEnd::Strong => {
209                            if let Some((Frame::Strong, children)) = stack.pop() {
210                                Some(Node::Strong { children })
211                            } else {
212                                None
213                            }
214                        }
215                        TagEnd::Link => {
216                            if let Some((Frame::Link { href, title }, children)) = stack.pop() {
217                                Some(Node::Link {
218                                    href,
219                                    title,
220                                    children,
221                                })
222                            } else {
223                                None
224                            }
225                        }
226                        TagEnd::Image => {
227                            if let Some((Frame::Image { src, alt }, _children)) = stack.pop() {
228                                // For images the alt text came through as Text events;
229                                // ignore those children and use the title as alt.
230                                Some(Node::Image { src, alt })
231                            } else {
232                                None
233                            }
234                        }
235                        TagEnd::CodeBlock => {
236                            if let Some((Frame::FencedCode { lang }, _)) = stack.pop() {
237                                let content = std::mem::take(&mut code_buf);
238                                // Strip the trailing newline that pulldown-cmark always adds.
239                                let content = content.trim_end_matches('\n').to_string();
240                                Some(Node::FencedCode { lang, content })
241                            } else {
242                                None
243                            }
244                        }
245                        // Unmodelled end tags — discard
246                        _ => None,
247                    };
248
249                    if let Some(n) = node {
250                        if let Some((_, ref mut parent_children)) = stack.last_mut() {
251                            parent_children.push(n);
252                        }
253                    }
254                }
255
256                // --- Leaf events ---
257                Event::Text(s) => {
258                    // If we're inside a code block, accumulate into code_buf.
259                    if matches!(stack.last(), Some((Frame::FencedCode { .. }, _))) {
260                        code_buf.push_str(&s);
261                    } else if let Some((_, ref mut children)) = stack.last_mut() {
262                        children.push(Node::Text(s.to_string()));
263                    }
264                }
265                Event::Code(s) => {
266                    if let Some((_, ref mut children)) = stack.last_mut() {
267                        children.push(Node::InlineCode {
268                            content: s.to_string(),
269                        });
270                    }
271                }
272                Event::Html(s) | Event::InlineHtml(s) => {
273                    if let Some((_, ref mut children)) = stack.last_mut() {
274                        children.push(Node::Html(s.to_string()));
275                    }
276                }
277                Event::SoftBreak => {
278                    if let Some((_, ref mut children)) = stack.last_mut() {
279                        children.push(Node::SoftBreak);
280                    }
281                }
282                Event::HardBreak => {
283                    if let Some((_, ref mut children)) = stack.last_mut() {
284                        children.push(Node::HardBreak);
285                    }
286                }
287                Event::Rule => {
288                    if let Some((_, ref mut children)) = stack.last_mut() {
289                        children.push(Node::HorizontalRule);
290                    }
291                }
292                // Ignore footnote references, task list markers, etc.
293                _ => {}
294            }
295        }
296
297        // Drain the root frame.
298        let nodes = match stack.into_iter().next() {
299            Some((Frame::Root, children)) => children,
300            _ => Vec::new(),
301        };
302
303        Self { nodes }
304    }
305
306    /// Returns the title: text content of the first H1 node, if any.
307    pub fn title(&self) -> Option<String> {
308        for node in &self.nodes {
309            if let Node::Heading { level: 1, children } = node {
310                let text = collect_text(children);
311                if !text.is_empty() {
312                    return Some(text);
313                }
314            }
315        }
316        None
317    }
318}
319
320// ---------------------------------------------------------------------------
321// Helpers
322// ---------------------------------------------------------------------------
323
324fn heading_level(level: HeadingLevel) -> u8 {
325    match level {
326        HeadingLevel::H1 => 1,
327        HeadingLevel::H2 => 2,
328        HeadingLevel::H3 => 3,
329        HeadingLevel::H4 => 4,
330        HeadingLevel::H5 => 5,
331        HeadingLevel::H6 => 6,
332    }
333}
334
335/// Recursively collect plain text from a node list.
336fn collect_text(nodes: &[Node]) -> String {
337    let mut out = String::new();
338    for node in nodes {
339        match node {
340            Node::Text(s) => out.push_str(s),
341            Node::InlineCode { content } => out.push_str(content),
342            Node::Emphasis { children }
343            | Node::Strong { children }
344            | Node::Link { children, .. } => {
345                out.push_str(&collect_text(children));
346            }
347            _ => {}
348        }
349    }
350    out
351}
352
353// ---------------------------------------------------------------------------
354// Tests
355// ---------------------------------------------------------------------------
356
#[cfg(test)]
mod tests {
    use super::*;

    /// Parses `src` and returns its single top-level node, failing the test
    /// with a readable message when the document has any other node count.
    fn sole_node(src: &str) -> Node {
        let mut ast = Ast::parse(src);
        assert_eq!(
            ast.nodes.len(),
            1,
            "expected exactly one top-level node, got {:?}",
            ast.nodes
        );
        ast.nodes.remove(0)
    }

    #[test]
    fn test_heading_parsing() {
        let ast = Ast::parse("# Hello\n\n## World\n");
        assert_eq!(ast.nodes.len(), 2);
        assert!(matches!(&ast.nodes[0], Node::Heading { level: 1, .. }));
        assert!(matches!(&ast.nodes[1], Node::Heading { level: 2, .. }));
    }

    #[test]
    fn test_heading_text_children() {
        match sole_node("# My Title\n") {
            Node::Heading { level, children } => {
                assert_eq!(level, 1);
                assert!(matches!(children.first(), Some(Node::Text(s)) if s == "My Title"));
            }
            other => panic!("expected Heading, got {:?}", other),
        }
    }

    #[test]
    fn test_title_from_h1() {
        let ast = Ast::parse("# The Title\n\nSome paragraph.\n");
        assert_eq!(ast.title(), Some("The Title".to_string()));
    }

    #[test]
    fn test_title_none_when_no_h1() {
        let ast = Ast::parse("## Subheading only\n");
        assert_eq!(ast.title(), None);
    }

    #[test]
    fn test_fenced_code() {
        match sole_node("```rust\nfn main() {}\n```\n") {
            Node::FencedCode { lang, content } => {
                assert_eq!(lang, "rust");
                assert_eq!(content, "fn main() {}");
            }
            other => panic!("expected FencedCode, got {:?}", other),
        }
    }

    #[test]
    fn test_fenced_code_no_lang() {
        match sole_node("```\nhello\n```\n") {
            Node::FencedCode { lang, content } => {
                assert_eq!(lang, "");
                assert_eq!(content, "hello");
            }
            other => panic!("expected FencedCode, got {:?}", other),
        }
    }

    #[test]
    fn test_paragraph_and_inline_code() {
        match sole_node("Use `cargo test` now.\n") {
            Node::Paragraph { children } => {
                let has_inline = children
                    .iter()
                    .any(|n| matches!(n, Node::InlineCode { content } if content == "cargo test"));
                assert!(has_inline);
            }
            other => panic!("expected Paragraph, got {:?}", other),
        }
    }

    // Previously named `test_nested_list`, although the input is a flat list.
    #[test]
    fn test_unordered_list() {
        match sole_node("- alpha\n- beta\n") {
            Node::List { ordered, items } => {
                assert!(!ordered);
                assert_eq!(items.len(), 2);
                assert!(items.iter().all(|item| matches!(item, Node::ListItem { .. })));
            }
            other => panic!("expected List, got {:?}", other),
        }
    }

    #[test]
    fn test_nested_list() {
        let outer_items = match sole_node("- outer\n  - inner\n") {
            Node::List { items, .. } => items,
            other => panic!("expected List, got {:?}", other),
        };
        assert_eq!(outer_items.len(), 1);

        let outer_children = match &outer_items[0] {
            Node::ListItem { children } => children,
            other => panic!("expected ListItem, got {:?}", other),
        };
        // The inner list must hang off the outer item, not off the document root.
        let has_inner_list = outer_children
            .iter()
            .any(|n| matches!(n, Node::List { ordered: false, items } if items.len() == 1));
        assert!(has_inner_list, "no nested list in {:?}", outer_children);
    }

    #[test]
    fn test_ordered_list() {
        match sole_node("1. one\n2. two\n") {
            Node::List { ordered, .. } => assert!(ordered),
            other => panic!("expected List, got {:?}", other),
        }
    }

    #[test]
    fn test_blockquote() {
        assert!(matches!(sole_node("> a quote\n"), Node::BlockQuote { .. }));
    }

    #[test]
    fn test_horizontal_rule() {
        assert!(matches!(sole_node("---\n"), Node::HorizontalRule));
    }
}