Skip to main content

marco_core/parser/
mod.rs

1// Marco Parser - 100% CommonMark Compliant (652/652 spec examples passing)
2// nom-based parser with full UTF-8 support (em dashes, smart quotes, Japanese, Arabic, emoji)
3
4pub mod ast;
5pub mod position;
6pub mod shared;
7
8// Modular parser structure
9pub mod blocks;
10pub mod inlines;
11
12// Re-export public API
13pub use ast::*;
14pub use blocks::parse_blocks;
15pub use inlines::parse_inlines;
16pub use position::*;
17
18/// Parse Markdown text into Document AST
19pub fn parse(input: &str) -> Result<Document, Box<dyn std::error::Error>> {
20    log::info!("Starting parse: {} bytes", input.len());
21
22    let mut document = parse_blocks(input)?;
23    log::debug!("Parsed {} blocks", document.children.len());
24
25    // Second pass: resolve reference-style links against collected reference definitions.
26    // This must happen after block parsing because definitions may appear later.
27    resolve_reference_links(&mut document);
28
29    // Third pass: transform top-level GitHub-style alert blockquotes (`> [!NOTE]`).
30    blocks::gfm_admonitions::apply_gfm_admonitions(&mut document);
31
32    Ok(document)
33}
34
35fn resolve_reference_links(document: &mut Document) {
36    resolve_reference_links_in_nodes(&mut document.children, &document.references);
37}
38
/// Removes CommonMark backslash escapes from `input`.
///
/// A backslash immediately followed by an ASCII punctuation character is
/// replaced by that character alone. Any other backslash (including one at the
/// very end of the input) is kept verbatim.
fn unescape_commonmark_backslash_escapes(input: &str) -> String {
    let mut result = String::with_capacity(input.len());
    let mut rest = input.chars().peekable();

    while let Some(current) = rest.next() {
        let emitted = if current == '\\' {
            // CommonMark's escapable set is exactly the ASCII punctuation
            // characters. Consume the follower only when it is one of them;
            // otherwise the backslash itself is emitted and the follower is
            // handled by the next iteration.
            rest.next_if(char::is_ascii_punctuation).unwrap_or(current)
        } else {
            current
        };
        result.push(emitted);
    }

    result
}
62
63fn resolve_reference_links_in_nodes(nodes: &mut Vec<Node>, references: &ReferenceMap) {
64    let mut i = 0;
65    while i < nodes.len() {
66        // Always resolve inside children first.
67        if !nodes[i].children.is_empty() {
68            resolve_reference_links_in_nodes(&mut nodes[i].children, references);
69        }
70
71        let is_ref = matches!(nodes[i].kind, NodeKind::LinkReference { .. });
72        if !is_ref {
73            i += 1;
74            continue;
75        }
76
77        // Temporarily take ownership of data we might need.
78        let (label, suffix) = match &nodes[i].kind {
79            NodeKind::LinkReference { label, suffix } => (label.clone(), suffix.clone()),
80            _ => unreachable!(),
81        };
82
83        if let Some((url, title)) = references.get(&label) {
84            nodes[i].kind = NodeKind::Link {
85                url: url.clone(),
86                title: title.clone(),
87            };
88            i += 1;
89            continue;
90        }
91
92        // Unresolved reference: fall back to literal bracketed text while preserving
93        // already-parsed children for the first bracket segment.
94        let mut inner_children = std::mem::take(&mut nodes[i].children);
95
96        let mut replacement: Vec<Node> = Vec::new();
97        replacement.push(Node {
98            kind: NodeKind::Text("[".to_string()),
99            span: None,
100            children: Vec::new(),
101        });
102        replacement.append(&mut inner_children);
103        replacement.push(Node {
104            kind: NodeKind::Text("]".to_string()),
105            span: None,
106            children: Vec::new(),
107        });
108        if !suffix.is_empty() {
109            replacement.push(Node {
110                kind: NodeKind::Text(unescape_commonmark_backslash_escapes(&suffix)),
111                span: None,
112                children: Vec::new(),
113            });
114        }
115
116        let replacement_len = replacement.len();
117        nodes.splice(i..i + 1, replacement);
118        i += replacement_len;
119    }
120}