Skip to main content

marco_core/parser/
ast.rs

1//! AST node definitions consumed by parser, renderer, and intelligence layers.
2
3use crate::parser::Span;
4use std::collections::HashMap;
5
/// Link reference definitions, keyed by normalized label.
///
/// Each entry stores the destination URL together with an optional title.
#[derive(Clone, Debug, Default)]
pub struct ReferenceMap {
    // normalized label (case-folded, whitespace collapsed) -> (url, optional title)
    defs: HashMap<String, (String, Option<String>)>,
}
12
13impl ReferenceMap {
14    /// Create an empty reference map.
15    pub fn new() -> Self {
16        Self::default()
17    }
18
19    /// Add a link reference definition
20    pub fn insert(&mut self, label: &str, url: String, title: Option<String>) {
21        let normalized = normalize_label(label);
22        // CommonMark: when multiple definitions normalize to the same label,
23        // the first definition takes precedence.
24        self.defs.entry(normalized).or_insert((url, title));
25    }
26
27    /// Lookup a link reference by label
28    pub fn get(&self, label: &str) -> Option<&(String, Option<String>)> {
29        let normalized = normalize_label(label);
30        self.defs.get(&normalized)
31    }
32
33    /// Check if a label exists
34    pub fn contains(&self, label: &str) -> bool {
35        let normalized = normalize_label(label);
36        self.defs.contains_key(&normalized)
37    }
38}
39
/// Normalize a link reference label according to the CommonMark spec:
/// - Apply Unicode case-folding semantics (best-effort)
/// - Collapse consecutive whitespace to a single space
/// - Trim leading/trailing whitespace
///
/// Returns the normalized label as a new `String`; an empty or
/// whitespace-only label yields an empty string.
fn normalize_label(label: &str) -> String {
    // U+00DF LATIN SMALL LETTER SHARP S. Spelled as an escape so the literal
    // cannot be damaged if this file is ever re-encoded.
    const SHARP_S: char = '\u{00DF}';

    // NOTE:
    // Rust doesn't provide full Unicode case-folding in std. We apply
    // to_lowercase() plus the critical sharp-s expansion so labels like
    // "ẞ" (U+1E9E) and "SS" normalize identically, matching CommonMark examples.
    let mut out = String::with_capacity(label.len());

    // `split_whitespace` both trims the ends and collapses internal runs of
    // whitespace, so joining the words with one space is all that is needed.
    for (index, word) in label.split_whitespace().enumerate() {
        if index > 0 {
            out.push(' ');
        }
        for lower in word.chars().flat_map(char::to_lowercase) {
            if lower == SHARP_S {
                out.push_str("ss");
            } else {
                out.push(lower);
            }
        }
    }

    out
}
74
75#[derive(Debug, Clone, Default)]
76/// Root parsed Markdown document.
77pub struct Document {
78    /// Top-level AST children in source order.
79    pub children: Vec<Node>,
80    /// Collected link reference definitions.
81    pub references: ReferenceMap,
82}
83
84#[derive(Debug, Clone)]
85/// Generic AST node.
86pub struct Node {
87    /// Semantic node kind.
88    pub kind: NodeKind,
89    /// Optional source span for this node.
90    pub span: Option<Span>,
91    /// Child nodes for hierarchical constructs.
92    pub children: Vec<Node>,
93}
94
/// Alignment of a table column (GFM tables extension).
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum TableAlignment {
    /// The column has no explicit alignment; this is the default.
    #[default]
    None,
    /// The column is left-aligned.
    Left,
    /// The column is center-aligned.
    Center,
    /// The column is right-aligned.
    Right,
}
108
/// Kinds of GitHub-style admonitions / alerts (GFM extension).
///
/// The syntax builds on blockquotes, for example:
///
/// `> [!NOTE]`
/// `> body...`
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum AdmonitionKind {
    /// The `Note` admonition.
    Note,
    /// The `Tip` admonition.
    Tip,
    /// The `Important` admonition.
    Important,
    /// The `Warning` admonition.
    Warning,
    /// The `Caution` admonition.
    Caution,
}
128
/// How an admonition is styled when rendered.
///
/// - `Alert`: the standard GitHub-style alert coloring (NOTE/TIP/WARNING/etc).
/// - `Quote`: quote-colored styling (neutral border/colors, like a regular
///   blockquote) that still keeps the admonition title layout.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum AdmonitionStyle {
    /// GitHub alert styling.
    Alert,
    /// Neutral, quote-like styling.
    Quote,
}
141
142#[derive(Debug, Clone)]
143/// All supported block and inline AST node kinds.
144pub enum NodeKind {
145    /// ATX or setext heading node.
146    Heading {
147        /// Heading level, typically in range 1..=6.
148        level: u8,
149        /// Plain heading text content.
150        text: String,
151        /// Explicit heading id, e.g. `### Title {#custom-id}`.
152        ///
153        /// When present, the renderer should emit it as `id="..."` on the
154        /// heading element.
155        id: Option<String>,
156    },
157    /// Paragraph container.
158    Paragraph,
159    /// Fenced or indented code block.
160    CodeBlock {
161        /// Optional language/info string.
162        language: Option<String>,
163        /// Raw code block contents.
164        code: String,
165    },
166    /// Horizontal rule (`---`, `***`, `___`).
167    ThematicBreak,
168    /// Ordered or unordered list container.
169    List {
170        /// Whether this is an ordered list.
171        ordered: bool,
172        /// Starting number for ordered lists.
173        start: Option<u32>,
174        /// Whether list items are tight (no blank separators).
175        tight: bool,
176    },
177    /// List item container.
178    ListItem,
179
180    /// Extended definition lists (Markdown Guide / Markdown Extra-style).
181    ///
182    /// Rendering convention:
183    /// - A `DefinitionList` contains alternating `DefinitionTerm` (`<dt>`) and
184    ///   `DefinitionDescription` (`<dd>`) children.
185    /// - `DefinitionTerm` should contain inline children.
186    /// - `DefinitionDescription` should contain block children.
187    DefinitionList,
188    /// Definition term (`dt`) item.
189    DefinitionTerm,
190    /// Definition description (`dd`) item.
191    DefinitionDescription,
192    /// GFM task list checkbox marker for a list item.
193    ///
194    /// This is emitted by the list parser when a list item begins with
195    /// `[ ]` or `[x]` / `[X]`.
196    ///
197    /// Rendering convention:
198    /// - This node is expected to appear as the first child inside a `ListItem`.
199    /// - The HTML renderer will convert it into a themed checkbox icon.
200    TaskCheckbox {
201        /// Whether checkbox is checked.
202        checked: bool,
203    },
204    /// Blockquote container.
205    Blockquote,
206    /// GitHub-style admonition / alert (GFM extension).
207    ///
208    /// This is created by a post-parse transformation that recognizes a special
209    /// first line inside a blockquote (e.g. `[!NOTE]`) and removes that marker.
210    Admonition {
211        /// Admonition semantic kind.
212        kind: AdmonitionKind,
213        /// Optional custom title for the admonition header.
214        ///
215        /// Used by extended GFM-style admonitions (e.g. `> [๐Ÿ˜‚ Happy Header]`).
216        title: Option<String>,
217        /// Optional custom icon content (typically a Unicode emoji) for the title.
218        ///
219        /// Rendered as text (not SVG) and must be styled by CSS.
220        icon: Option<String>,
221        /// Render variant.
222        style: AdmonitionStyle,
223    },
224
225    /// Extended tab blocks.
226    ///
227    /// Syntax (container + items):
228    /// ```text
229    /// :::tab
230    /// @tab Title
231    /// Content...
232    /// :::
233    /// ```
234    ///
235    /// Children convention:
236    /// - A `TabGroup` contains one or more `TabItem` children.
237    /// - Each `TabItem` contains block children representing the tab panel content.
238    TabGroup,
239    /// A single tab item inside a tab group.
240    TabItem {
241        /// User-visible tab title.
242        title: String,
243    },
244
245    /// Extended slide decks (Reveal.js-like syntax, rendered as a simple slideshow).
246    ///
247    /// Syntax:
248    /// ```text
249    /// @slidestart
250    /// slide 1
251    /// ---
252    /// slide 2
253    /// @slideend
254    /// ```
255    ///
256    /// Optional timer (seconds per slide): `@slidestart:t5`.
257    ///
258    /// Children convention:
259    /// - A `SliderDeck` contains one or more `Slide` children.
260    /// - Each `Slide` contains block children representing the slide content.
261    SliderDeck {
262        /// Optional per-slide timer value in seconds.
263        timer_seconds: Option<u32>,
264    },
265    /// A single slide inside a slider deck.
266    Slide {
267        /// True if this slide started after a vertical separator (`--`).
268        ///
269        /// The current viewer treats slides as a single linear sequence
270        /// (left/right). This flag is preserved for future vertical navigation.
271        vertical: bool,
272    },
273    /// GFM table (pipe table extension).
274    ///
275    /// Children convention:
276    /// - Each child is a `TableRow`.
277    /// - Each `TableRow` contains `TableCell` children.
278    Table {
279        /// Per-column alignments.
280        alignments: Vec<TableAlignment>,
281    },
282    /// A single table row.
283    TableRow {
284        /// Whether this row is part of the table header.
285        header: bool,
286    },
287    /// A single table cell.
288    TableCell {
289        /// Whether this cell is in a header row.
290        header: bool,
291        /// Effective alignment for this cell.
292        alignment: TableAlignment,
293    },
294    /// Raw block-level HTML fragment.
295    HtmlBlock {
296        /// Raw HTML source.
297        html: String,
298    },
299
300    /// GFM-style footnote definition (extension).
301    ///
302    /// Syntax:
303    /// - `[^label]: definition text`
304    /// - Continuation lines may be indented.
305    ///
306    /// Rendering convention:
307    /// - This node should not be rendered in place.
308    /// - Instead, the renderer collects referenced footnotes and emits a
309    ///   footnotes section at the end of the document.
310    FootnoteDefinition {
311        /// Footnote label (without `[^`/`]`).
312        label: String,
313    },
314
315    /// Plain text inline content.
316    Text(String),
317    /// Inline task checkbox marker (extension).
318    ///
319    /// This is emitted when a paragraph begins with a task marker like
320    /// `[ ] ` / `[x] ` / `[X] `.
321    ///
322    /// Rendering convention:
323    /// - The HTML renderer converts it into the same themed SVG checkbox icon
324    ///   used for task list items.
325    TaskCheckboxInline {
326        /// Whether checkbox is checked.
327        checked: bool,
328    },
329    /// Emphasis inline container.
330    Emphasis,
331    /// Strong emphasis inline container.
332    Strong,
333    /// Combined strong+emphasis, e.g. `***text***` or `___text___`.
334    ///
335    /// This is parsed as a single inline node to avoid leaving dangling
336    /// delimiters that would otherwise be treated as plain text.
337    StrongEmphasis,
338    /// Strikethrough (extension), e.g. `~~text~~`.
339    Strikethrough,
340    /// Highlight/mark (extension), e.g. `==text==`.
341    Mark,
342    /// Superscript (extension), e.g. `^text^`.
343    Superscript,
344    /// Subscript (extension), e.g. `~text~`.
345    Subscript,
346    /// Inline link node.
347    Link {
348        /// Link destination URL.
349        url: String,
350        /// Optional link title.
351        title: Option<String>,
352    },
353    /// Reference-style link placeholder (CommonMark): `[text][label]`, `[label][]`, `[label]`.
354    ///
355    /// These cannot be fully resolved during inline parsing because reference
356    /// definitions may appear later in the document. The top-level `parse()`
357    /// performs a post-processing pass that converts this into a `Link` when a
358    /// matching definition exists in `Document.references`.
359    ///
360    /// If no matching definition is found, this should be rendered as literal
361    /// bracketed text (preserving the already-parsed `children` for the first
362    /// bracketed segment).
363    LinkReference {
364        /// Label used for reference resolution (will be normalized when looked up).
365        label: String,
366        /// Extra literal suffix after the first `]` (e.g. `"[]"` or `"[label]"`).
367        /// Empty for shortcut reference links.
368        suffix: String,
369    },
370
371    /// GFM-style footnote reference (extension), e.g. `[^label]`.
372    ///
373    /// Rendering convention:
374    /// - If a matching `FootnoteDefinition` exists, this renders as a numbered
375    ///   superscript link.
376    /// - Otherwise it should fall back to literal text.
377    FootnoteReference {
378        /// Referenced footnote label.
379        label: String,
380    },
381    /// Inline image node.
382    Image {
383        /// Image source URL.
384        url: String,
385        /// Image alt text.
386        alt: String,
387    },
388    /// Inline code span.
389    CodeSpan(String),
390    /// Inline HTML fragment.
391    InlineHtml(String),
392    /// Hard line break (two spaces + newline or backslash + newline).
393    HardBreak,
394    /// Soft line break.
395    SoftBreak,
396
397    /// Extended user mentions.
398    ///
399    /// Syntax:
400    /// - `@username[platform]`
401    /// - `@username[platform](Display Name)`
402    ///
403    /// Rendering policy:
404    /// - The renderer may convert this to an external profile link based on
405    ///   a platform mapping table.
406    PlatformMention {
407        /// Platform username/handle.
408        username: String,
409        /// Platform key, for example `github`.
410        platform: String,
411        /// Optional display label override.
412        display: Option<String>,
413    },
414    /// Inline math (LaTeX), e.g. `$E = mc^2$`.
415    ///
416    /// Rendering policy:
417    /// - Rendered using KaTeX in inline mode.
418    /// - Content is raw LaTeX source code.
419    InlineMath {
420        /// Raw inline LaTeX content.
421        content: String,
422    },
423
424    /// Display math (LaTeX), e.g. `$$\int_0^\infty e^{-x^2} dx$$`.
425    ///
426    /// Rendering policy:
427    /// - Rendered using KaTeX in display mode.
428    /// - Content is raw LaTeX source code.
429    DisplayMath {
430        /// Raw display LaTeX content.
431        content: String,
432    },
433
434    /// Mermaid diagram (code block with language="mermaid").
435    ///
436    /// Rendering policy:
437    /// - Rendered using mermaid-rs-renderer to SVG.
438    /// - Content is raw Mermaid diagram source code.
439    ///
440    /// This is created during parsing when a fenced code block has
441    /// info string "mermaid".
442    MermaidDiagram {
443        /// Raw Mermaid diagram source.
444        content: String,
445    },
446}
447
448impl Document {
449    /// Create an empty document.
450    pub fn new() -> Self {
451        Self::default()
452    }
453
454    /// Number of top-level nodes in the document.
455    pub fn len(&self) -> usize {
456        self.children.len()
457    }
458
459    /// Returns `true` when the document has no top-level nodes.
460    pub fn is_empty(&self) -> bool {
461        self.children.is_empty()
462    }
463}
464
#[cfg(test)]
mod tests {
    use super::ReferenceMap;

    /// A later definition for an already-registered label must not replace
    /// the first one.
    #[test]
    fn smoke_test_reference_map_first_definition_wins() {
        let mut refs = ReferenceMap::new();
        refs.insert("foo", "https://first.example".to_string(), None);
        refs.insert("foo", "https://second.example".to_string(), None);

        let (url, title) = refs.get("foo").expect("reference not found");
        assert_eq!(url, "https://first.example");
        assert_eq!(title, &None);
    }

    /// A definition stored under "SS" must be found when looked up with the
    /// capital sharp s (U+1E9E).
    #[test]
    fn smoke_test_reference_map_casefold_sharp_s() {
        let mut refs = ReferenceMap::new();
        refs.insert("SS", "/url".to_string(), None);

        // Written as an escape so the lookup key stays U+1E9E even if this
        // file is re-encoded.
        let (url, _) = refs.get("\u{1E9E}").expect("reference not found");
        assert_eq!(url, "/url");
    }

    /// Labels that differ only in case and in the amount or kind of
    /// whitespace must resolve to the same definition.
    #[test]
    fn smoke_test_reference_map_whitespace_collapse() {
        let mut refs = ReferenceMap::new();
        refs.insert("Foo\n\t  bar", "/url".to_string(), None);

        assert!(refs.contains("foo bar"));
        assert!(refs.contains("  FOO   BAR  "));
    }
}