Skip to main content

marco_core/parser/
ast.rs

// AST node definitions: central representation consumed by renderer and intelligence
2
3use crate::parser::Span;
4use std::collections::HashMap;
5
/// Link reference map: stores `[label]: url` definitions for later resolution.
#[derive(Debug, Clone, Default)]
pub struct ReferenceMap {
    /// Key: normalized label (case-folded, whitespace collapsed).
    /// Value: `(url, optional title)`.
    defs: HashMap<String, (String, Option<String>)>,
}
12
13impl ReferenceMap {
14    pub fn new() -> Self {
15        Self::default()
16    }
17
18    /// Add a link reference definition
19    pub fn insert(&mut self, label: &str, url: String, title: Option<String>) {
20        let normalized = normalize_label(label);
21        // CommonMark: when multiple definitions normalize to the same label,
22        // the first definition takes precedence.
23        self.defs.entry(normalized).or_insert((url, title));
24    }
25
26    /// Lookup a link reference by label
27    pub fn get(&self, label: &str) -> Option<&(String, Option<String>)> {
28        let normalized = normalize_label(label);
29        self.defs.get(&normalized)
30    }
31
32    /// Check if a label exists
33    pub fn contains(&self, label: &str) -> bool {
34        let normalized = normalize_label(label);
35        self.defs.contains_key(&normalized)
36    }
37}
38
/// Normalize a link label for reference matching (CommonMark-style).
///
/// - Leading/trailing whitespace is dropped and every internal run of
///   whitespace is collapsed to a single ASCII space.
/// - Characters are lowercased with `char::to_lowercase` as a best-effort
///   stand-in for full Unicode case folding, which `std` does not provide.
/// - Sharp s (U+00DF) is expanded to `"ss"`, so capital sharp s (U+1E9E),
///   sharp s and `"SS"` all normalize to the same key, matching the
///   CommonMark examples.
fn normalize_label(label: &str) -> String {
    // Written as an escape so the comparison survives encoding round-trips
    // of the source file.
    const SHARP_S: char = '\u{00DF}';

    let mut out = String::with_capacity(label.len());
    for (idx, word) in label.split_whitespace().enumerate() {
        // Single separator between words; none before the first or after the last.
        if idx > 0 {
            out.push(' ');
        }
        // One char may lowercase to several chars, hence the flat_map.
        for lower in word.chars().flat_map(char::to_lowercase) {
            if lower == SHARP_S {
                out.push_str("ss");
            } else {
                out.push(lower);
            }
        }
    }

    out
}
64
65// Root document node
66#[derive(Debug, Clone, Default)]
67pub struct Document {
68    pub children: Vec<Node>,
69    pub references: ReferenceMap,
70}
71
72// Generic AST node
73#[derive(Debug, Clone)]
74pub struct Node {
75    pub kind: NodeKind,
76    pub span: Option<Span>,
77    pub children: Vec<Node>,
78}
79
/// Table column alignment (GFM tables extension).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum TableAlignment {
    /// No explicit alignment (the default).
    #[default]
    None,
    /// Left-aligned column.
    Left,
    /// Centered column.
    Center,
    /// Right-aligned column.
    Right,
}
89
/// GitHub-style admonitions / alerts (GFM extension).
///
/// Syntax is based on blockquotes, e.g.
///
/// `> [!NOTE]`
/// `> body...`
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AdmonitionKind {
    Note,
    Tip,
    Important,
    Warning,
    Caution,
}
104
/// Rendering style for admonitions.
///
/// - `Alert`: Standard GitHub-style alert coloring (NOTE/TIP/WARNING/etc).
/// - `Quote`: Quote-colored styling (neutral border/colors like regular blockquotes) while
///   keeping the admonition title layout.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum AdmonitionStyle {
    /// Standard alert coloring.
    Alert,
    /// Neutral, blockquote-like coloring with the admonition title layout.
    Quote,
}
115
116// All node types
117#[derive(Debug, Clone)]
118pub enum NodeKind {
119    // Block-level
120    Heading {
121        level: u8,
122        text: String,
123        /// Explicit heading id, e.g. `### Title {#custom-id}`.
124        ///
125        /// When present, the renderer should emit it as `id="..."` on the
126        /// heading element.
127        id: Option<String>,
128    },
129    Paragraph,
130    CodeBlock {
131        language: Option<String>,
132        code: String,
133    },
134    ThematicBreak, // Horizontal rule (---, ***, ___)
135    List {
136        ordered: bool,
137        start: Option<u32>, // Starting number for ordered lists
138        tight: bool,        // No blank lines between items
139    },
140    ListItem,
141
142    /// Extended definition lists (Markdown Guide / Markdown Extra-style).
143    ///
144    /// Rendering convention:
145    /// - A `DefinitionList` contains alternating `DefinitionTerm` (`<dt>`) and
146    ///   `DefinitionDescription` (`<dd>`) children.
147    /// - `DefinitionTerm` should contain inline children.
148    /// - `DefinitionDescription` should contain block children.
149    DefinitionList,
150    DefinitionTerm,
151    DefinitionDescription,
152    /// GFM task list checkbox marker for a list item.
153    ///
154    /// This is emitted by the list parser when a list item begins with
155    /// `[ ]` or `[x]` / `[X]`.
156    ///
157    /// Rendering convention:
158    /// - This node is expected to appear as the first child inside a `ListItem`.
159    /// - The HTML renderer will convert it into a themed checkbox icon.
160    TaskCheckbox {
161        checked: bool,
162    },
163    Blockquote,
164    /// GitHub-style admonition / alert (GFM extension).
165    ///
166    /// This is created by a post-parse transformation that recognizes a special
167    /// first line inside a blockquote (e.g. `[!NOTE]`) and removes that marker.
168    Admonition {
169        kind: AdmonitionKind,
170        /// Optional custom title for the admonition header.
171        ///
172        /// Used by extended GFM-style admonitions (e.g. `> [๐Ÿ˜‚ Happy Header]`).
173        title: Option<String>,
174        /// Optional custom icon content (typically a Unicode emoji) for the title.
175        ///
176        /// Rendered as text (not SVG) and must be styled by CSS.
177        icon: Option<String>,
178        /// Render variant.
179        style: AdmonitionStyle,
180    },
181
182    /// Marco extended tab blocks.
183    ///
184    /// Syntax (container + items):
185    /// ```text
186    /// :::tab
187    /// @tab Title
188    /// Content...
189    /// :::
190    /// ```
191    ///
192    /// Children convention:
193    /// - A `TabGroup` contains one or more `TabItem` children.
194    /// - Each `TabItem` contains block children representing the tab panel content.
195    TabGroup,
196    TabItem {
197        title: String,
198    },
199
200    /// Marco sliders (planned Reveal.js-like syntax, rendered as a simple slideshow).
201    ///
202    /// Syntax:
203    /// ```text
204    /// @slidestart
205    /// slide 1
206    /// ---
207    /// slide 2
208    /// @slideend
209    /// ```
210    ///
211    /// Optional timer (seconds per slide): `@slidestart:t5`.
212    ///
213    /// Children convention:
214    /// - A `SliderDeck` contains one or more `Slide` children.
215    /// - Each `Slide` contains block children representing the slide content.
216    SliderDeck {
217        timer_seconds: Option<u32>,
218    },
219    Slide {
220        /// True if this slide started after a vertical separator (`--`).
221        ///
222        /// The current Marco viewer treats slides as a single linear sequence
223        /// (left/right). This flag is preserved for future vertical navigation.
224        vertical: bool,
225    },
226    /// GFM table (pipe table extension).
227    ///
228    /// Children convention:
229    /// - Each child is a `TableRow`.
230    /// - Each `TableRow` contains `TableCell` children.
231    Table {
232        alignments: Vec<TableAlignment>,
233    },
234    TableRow {
235        header: bool,
236    },
237    TableCell {
238        header: bool,
239        alignment: TableAlignment,
240    },
241    HtmlBlock {
242        html: String,
243    }, // Block-level HTML (comments, tags, etc.)
244
245    /// GFM-style footnote definition (extension).
246    ///
247    /// Syntax:
248    /// - `[^label]: definition text`
249    /// - Continuation lines may be indented.
250    ///
251    /// Rendering convention:
252    /// - This node should not be rendered in place.
253    /// - Instead, the renderer collects referenced footnotes and emits a
254    ///   footnotes section at the end of the document.
255    FootnoteDefinition {
256        label: String,
257    },
258
259    // Inline-level
260    Text(String),
261    /// Inline task checkbox marker (extension).
262    ///
263    /// This is emitted when a paragraph begins with a task marker like
264    /// `[ ] ` / `[x] ` / `[X] `.
265    ///
266    /// Rendering convention:
267    /// - The HTML renderer converts it into the same themed SVG checkbox icon
268    ///   used for task list items.
269    TaskCheckboxInline {
270        checked: bool,
271    },
272    Emphasis,
273    Strong,
274    /// Combined strong+emphasis, e.g. `***text***` or `___text___`.
275    ///
276    /// This is parsed as a single inline node to avoid leaving dangling
277    /// delimiters that would otherwise be treated as plain text.
278    StrongEmphasis,
279    /// Strikethrough (extension), e.g. `~~text~~`.
280    Strikethrough,
281    /// Highlight/mark (extension), e.g. `==text==`.
282    Mark,
283    /// Superscript (extension), e.g. `^text^`.
284    Superscript,
285    /// Subscript (extension), e.g. `~text~`.
286    Subscript,
287    Link {
288        url: String,
289        title: Option<String>,
290    },
291    /// Reference-style link placeholder (CommonMark): `[text][label]`, `[label][]`, `[label]`.
292    ///
293    /// These cannot be fully resolved during inline parsing because reference
294    /// definitions may appear later in the document. The top-level `parse()`
295    /// performs a post-processing pass that converts this into a `Link` when a
296    /// matching definition exists in `Document.references`.
297    ///
298    /// If no matching definition is found, this should be rendered as literal
299    /// bracketed text (preserving the already-parsed `children` for the first
300    /// bracketed segment).
301    LinkReference {
302        /// Label used for reference resolution (will be normalized when looked up).
303        label: String,
304        /// Extra literal suffix after the first `]` (e.g. `"[]"` or `"[label]"`).
305        /// Empty for shortcut reference links.
306        suffix: String,
307    },
308
309    /// GFM-style footnote reference (extension), e.g. `[^label]`.
310    ///
311    /// Rendering convention:
312    /// - If a matching `FootnoteDefinition` exists, this renders as a numbered
313    ///   superscript link.
314    /// - Otherwise it should fall back to literal text.
315    FootnoteReference {
316        label: String,
317    },
318    Image {
319        url: String,
320        alt: String,
321    },
322    CodeSpan(String),
323    InlineHtml(String),
324    HardBreak, // Two spaces + newline, or backslash + newline
325    SoftBreak, // Regular newline (rendered as space in HTML)
326
327    /// Marco extended user mentions.
328    ///
329    /// Syntax:
330    /// - `@username[platform]`
331    /// - `@username[platform](Display Name)`
332    ///
333    /// Rendering policy:
334    /// - The renderer may convert this to an external profile link based on
335    ///   a platform mapping table.
336    PlatformMention {
337        username: String,
338        platform: String,
339        display: Option<String>,
340    },
341    /// Inline math (LaTeX), e.g. `$E = mc^2$`.
342    ///
343    /// Rendering policy:
344    /// - Rendered using KaTeX in inline mode.
345    /// - Content is raw LaTeX source code.
346    InlineMath {
347        content: String,
348    },
349
350    /// Display math (LaTeX), e.g. `$$\int_0^\infty e^{-x^2} dx$$`.
351    ///
352    /// Rendering policy:
353    /// - Rendered using KaTeX in display mode.
354    /// - Content is raw LaTeX source code.
355    DisplayMath {
356        content: String,
357    },
358
359    /// Mermaid diagram (code block with language="mermaid").
360    ///
361    /// Rendering policy:
362    /// - Rendered using mermaid-rs-renderer to SVG.
363    /// - Content is raw Mermaid diagram source code.
364    ///
365    /// This is created during parsing when a fenced code block has
366    /// info string "mermaid".
367    MermaidDiagram {
368        content: String,
369    },
370}
371
372impl Document {
373    pub fn new() -> Self {
374        Self::default()
375    }
376
377    pub fn len(&self) -> usize {
378        self.children.len()
379    }
380
381    pub fn is_empty(&self) -> bool {
382        self.children.is_empty()
383    }
384}
385
#[cfg(test)]
mod tests {
    use super::ReferenceMap;

    /// CommonMark: a later definition with the same label must not override the first.
    #[test]
    fn smoke_test_reference_map_first_definition_wins() {
        let mut refs = ReferenceMap::new();
        refs.insert("foo", "https://first.example".to_string(), None);
        refs.insert("foo", "https://second.example".to_string(), None);

        let (url, title) = refs.get("foo").expect("reference not found");
        assert_eq!(url, "https://first.example");
        assert_eq!(title, &None);
    }

    /// Capital sharp s must resolve to a definition stored under "SS".
    #[test]
    fn smoke_test_reference_map_casefold_sharp_s() {
        let mut refs = ReferenceMap::new();
        refs.insert("SS", "/url".to_string(), None);

        // U+1E9E (capital sharp s), written as an escape so the test does not
        // depend on how the source file is encoded.
        let (url, _) = refs.get("\u{1E9E}").expect("reference not found");
        assert_eq!(url, "/url");
    }

    /// Whitespace runs (including newlines/tabs) collapse and case is ignored.
    #[test]
    fn smoke_test_reference_map_whitespace_collapse() {
        let mut refs = ReferenceMap::new();
        refs.insert("Foo\n\t  bar", "/url".to_string(), None);

        assert!(refs.contains("foo bar"));
        assert!(refs.contains("  FOO   BAR  "));
    }
}