mdx/
parser.rs

1use crate::error::Error;
2use pulldown_cmark::{CodeBlockKind, Event, Options as CmarkOptions, Parser as CmarkParser};
3use serde_yaml;
4use std::collections::HashMap;
5
/// Options controlling how a Markdown document is parsed.
///
/// All options are enabled by [`Default`]. The type is comparable so that
/// option sets can be checked for equality (for example in tests or when
/// caching parse results per configuration).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseOptions {
    /// Enable GitHub-flavored markdown (tables, strikethrough and task lists).
    pub gfm: bool,
    /// Enable smart punctuation.
    pub smart_punctuation: bool,
    /// Parse a leading YAML frontmatter block delimited by `---` lines.
    pub frontmatter: bool,
    /// Enable the custom component syntax (`::name{...}` ... `::`).
    pub custom_components: bool,
}
18
19impl Default for ParseOptions {
20    fn default() -> Self {
21        Self {
22            gfm: true,
23            smart_punctuation: true,
24            frontmatter: true,
25            custom_components: true,
26        }
27    }
28}
29
30/// Represents a parsed Markdown document
31#[derive(Debug)]
32pub struct ParsedDocument {
33    /// The AST (Abstract Syntax Tree) of the markdown document
34    pub ast: Vec<Node>,
35    /// Frontmatter metadata if present
36    pub frontmatter: Option<HashMap<String, serde_yaml::Value>>,
37}
38
39/// Represents a node in the Markdown AST
40#[derive(Debug, Clone)]
41pub enum Node {
42    /// A heading with level (1-6) and content
43    Heading {
44        level: u8,
45        content: String,
46        id: String,
47    },
48    /// A paragraph of text
49    Paragraph(Vec<InlineNode>),
50    /// A blockquote
51    BlockQuote(Vec<Node>),
52    /// A code block with optional language
53    CodeBlock {
54        language: Option<String>,
55        content: String,
56        attributes: HashMap<String, String>,
57    },
58    /// A list (ordered or unordered)
59    List {
60        ordered: bool,
61        items: Vec<Vec<Node>>,
62    },
63    /// A thematic break (horizontal rule)
64    ThematicBreak,
65    /// A custom component
66    Component {
67        name: String,
68        attributes: HashMap<String, String>,
69        children: Vec<Node>,
70    },
71    /// Raw HTML
72    Html(String),
73    /// Table
74    Table {
75        headers: Vec<Vec<InlineNode>>,
76        rows: Vec<Vec<Vec<InlineNode>>>,
77        alignments: Vec<Alignment>,
78    },
79}
80
81impl Node {
82    /// Get the name of a component node
83    pub fn name(&self) -> &str {
84        match self {
85            Node::Component { name, .. } => name,
86            _ => "",
87        }
88    }
89
90    /// Get the attributes of a component node
91    pub fn attributes(&self) -> HashMap<String, String> {
92        match self {
93            Node::Component { attributes, .. } => attributes.clone(),
94            _ => HashMap::new(),
95        }
96    }
97
98    /// Get the children of a component node
99    pub fn children(&self) -> Vec<Node> {
100        match self {
101            Node::Component { children, .. } => children.clone(),
102            _ => Vec::new(),
103        }
104    }
105}
106
/// An inline node of the Markdown AST.
///
/// The type is comparable so that parsed inline content can be asserted on
/// directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum InlineNode {
    /// Plain text.
    Text(String),
    /// Emphasized text.
    Emphasis(Vec<InlineNode>),
    /// Strongly emphasized text.
    Strong(Vec<InlineNode>),
    /// Strikethrough text.
    Strikethrough(Vec<InlineNode>),
    /// A hyperlink.
    Link {
        /// Inline content shown as the link text.
        text: Vec<InlineNode>,
        /// Link destination.
        url: String,
        /// Optional link title.
        title: Option<String>,
    },
    /// An image.
    Image {
        /// Alternative text.
        alt: String,
        /// Image source.
        url: String,
        /// Optional image title.
        title: Option<String>,
    },
    /// Inline code.
    Code(String),
    /// A line break.
    LineBreak,
    /// Inline HTML, kept verbatim.
    Html(String),
}
137
/// Table column alignment.
///
/// Comparable and hashable so alignments can be tested with `==` and used
/// as map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Alignment {
    /// No specific alignment.
    None,
    /// Left aligned.
    Left,
    /// Center aligned.
    Center,
    /// Right aligned.
    Right,
}
150
151/// Parse a Markdown string into an AST
152pub fn parse(markdown: &str, options: &ParseOptions) -> Result<ParsedDocument, Error> {
153    let mut frontmatter = None;
154    let mut content = markdown.to_string();
155
156    // Process frontmatter if enabled
157    if options.frontmatter && content.starts_with("---") {
158        if let Some((yaml, rest)) = extract_frontmatter(&content) {
159            frontmatter = parse_yaml_frontmatter(yaml)?;
160            content = rest.to_string();
161        }
162    }
163
164    // Configure pulldown-cmark parser options
165    let mut cmark_options = CmarkOptions::empty();
166    if options.gfm {
167        cmark_options.insert(CmarkOptions::ENABLE_TABLES);
168        cmark_options.insert(CmarkOptions::ENABLE_STRIKETHROUGH);
169        cmark_options.insert(CmarkOptions::ENABLE_TASKLISTS);
170    }
171    if options.smart_punctuation {
172        cmark_options.insert(CmarkOptions::ENABLE_SMART_PUNCTUATION);
173    }
174
175    // Parse Markdown content
176    let parser = CmarkParser::new_ext(&content, cmark_options);
177    let ast = process_events(parser, options)?;
178
179    Ok(ParsedDocument { ast, frontmatter })
180}
181
/// Split a leading YAML frontmatter block off `content`.
///
/// `content` must start with `---`. The block ends at the first later line
/// that consists of exactly `---` (terminated by `\n`, `\r\n`, or the end of
/// input). Lines that merely start with `---` (such as `----` or `---foo`)
/// are treated as part of the YAML text.
///
/// Returns `(yaml, rest)` where `yaml` is the text between the fences and
/// `rest` is everything after the closing fence line, or `None` when there is
/// no opening or closing fence.
fn extract_frontmatter(content: &str) -> Option<(&str, &str)> {
    const FENCE: &str = "\n---";

    let rest = content.strip_prefix("---")?;
    let mut search_from = 0;

    while let Some(offset) = rest[search_from..].find(FENCE) {
        let fence_start = search_from + offset;
        let after_fence = &rest[fence_start + FENCE.len()..];

        // Only accept the fence when nothing else follows it on its line.
        // Stripping the line ending explicitly (instead of skipping a fixed
        // byte count) keeps the slice on a character boundary.
        let body = if after_fence.is_empty() {
            Some("")
        } else {
            after_fence
                .strip_prefix("\r\n")
                .or_else(|| after_fence.strip_prefix('\n'))
        };

        if let Some(body) = body {
            return Some((&rest[..fence_start], body));
        }

        // Not a real fence; continue searching after this newline.
        search_from = fence_start + 1;
    }

    None
}
195
196// Parse YAML frontmatter into a HashMap
197fn parse_yaml_frontmatter(yaml: &str) -> Result<Option<HashMap<String, serde_yaml::Value>>, Error> {
198    let frontmatter: HashMap<String, serde_yaml::Value> = serde_yaml::from_str(yaml)?;
199    if frontmatter.is_empty() {
200        Ok(None)
201    } else {
202        Ok(Some(frontmatter))
203    }
204}
205
206// Process pulldown-cmark parser events into our AST
207fn process_events<'a, I>(events: I, options: &ParseOptions) -> Result<Vec<Node>, Error>
208where
209    I: Iterator<Item = Event<'a>>,
210{
211    let mut nodes = Vec::new();
212    let mut current_node: Option<Node> = None;
213    let mut current_inline_nodes: Vec<InlineNode> = Vec::new();
214    let mut list_stack: Vec<(bool, Vec<Vec<Node>>)> = Vec::new();
215    let mut block_quote_stack: Vec<Vec<Node>> = Vec::new();
216    let mut link_stack: Vec<(String, Option<String>, Vec<InlineNode>)> = Vec::new();
217    let mut component_stack: Vec<(String, HashMap<String, String>, Vec<Node>)> = Vec::new();
218    let mut table_headers: Vec<Vec<InlineNode>> = Vec::new();
219    let mut table_alignments: Vec<Alignment> = Vec::new();
220    let mut table_rows: Vec<Vec<Vec<InlineNode>>> = Vec::new();
221    let mut in_table_head = false;
222    let mut in_table_row = false;
223    let mut current_table_row: Vec<Vec<InlineNode>> = Vec::new();
224    let mut current_table_cell: Vec<InlineNode> = Vec::new();
225    let mut _in_emphasis = false;
226    let mut _in_strong = false;
227    let mut _in_strikethrough = false;
228
229    use pulldown_cmark::{Event, Tag};
230
231    let mut events = events.peekable();
232
233    while let Some(event) = events.next() {
234        match event {
235            Event::Start(Tag::Paragraph) => {
236                current_inline_nodes = Vec::new();
237            }
238            Event::End(Tag::Paragraph) => {
239                if !current_inline_nodes.is_empty() {
240                    let node = Node::Paragraph(current_inline_nodes.clone());
241                    current_inline_nodes.clear();
242
243                    if !block_quote_stack.is_empty() {
244                        let last_idx = block_quote_stack.len() - 1;
245                        block_quote_stack[last_idx].push(node);
246                    } else if !list_stack.is_empty() {
247                        let last_list_idx = list_stack.len() - 1;
248                        if let Some(last_item) = list_stack[last_list_idx].1.last_mut() {
249                            last_item.push(node);
250                        }
251                    } else if !component_stack.is_empty() {
252                        let last_idx = component_stack.len() - 1;
253                        component_stack[last_idx].2.push(node);
254                    } else {
255                        nodes.push(node);
256                    }
257                }
258            }
259            Event::Start(Tag::Heading(level, _, _)) => {
260                current_inline_nodes = Vec::new();
261                current_node = Some(Node::Heading {
262                    level: level as u8,
263                    content: String::new(),
264                    id: String::new(),
265                });
266            }
267            Event::End(Tag::Heading(..)) => {
268                if let Some(Node::Heading { level, .. }) = current_node {
269                    // Convert inline nodes to string for the heading content
270                    let mut content = String::new();
271                    for node in &current_inline_nodes {
272                        match node {
273                            InlineNode::Text(text) => content.push_str(text),
274                            InlineNode::Code(code) => content.push_str(code),
275                            _ => {} // Simplified handling
276                        }
277                    }
278
279                    // Generate a slug ID from the heading content
280                    let id = content
281                        .to_lowercase()
282                        .replace(|c: char| !c.is_alphanumeric(), "-")
283                        .replace("--", "-")
284                        .trim_matches('-')
285                        .to_string();
286
287                    let heading = Node::Heading { level, content, id };
288
289                    if !block_quote_stack.is_empty() {
290                        let last_idx = block_quote_stack.len() - 1;
291                        block_quote_stack[last_idx].push(heading);
292                    } else if !component_stack.is_empty() {
293                        let last_idx = component_stack.len() - 1;
294                        component_stack[last_idx].2.push(heading);
295                    } else {
296                        nodes.push(heading);
297                    }
298
299                    current_node = None;
300                    current_inline_nodes.clear();
301                }
302            }
303            Event::Start(Tag::BlockQuote) => {
304                block_quote_stack.push(Vec::new());
305            }
306            Event::End(Tag::BlockQuote) => {
307                if let Some(quote_nodes) = block_quote_stack.pop() {
308                    let node = Node::BlockQuote(quote_nodes);
309
310                    if !block_quote_stack.is_empty() {
311                        let last_idx = block_quote_stack.len() - 1;
312                        block_quote_stack[last_idx].push(node);
313                    } else if !component_stack.is_empty() {
314                        let last_idx = component_stack.len() - 1;
315                        component_stack[last_idx].2.push(node);
316                    } else {
317                        nodes.push(node);
318                    }
319                }
320            }
321            Event::Start(Tag::CodeBlock(kind)) => {
322                let mut language = None;
323                let attributes = HashMap::new();
324
325                if let CodeBlockKind::Fenced(lang) = kind {
326                    let lang_str = lang.to_string();
327                    if !lang_str.is_empty() {
328                        language = Some(lang_str);
329                    }
330                }
331
332                current_node = Some(Node::CodeBlock {
333                    language,
334                    content: String::new(),
335                    attributes,
336                });
337            }
338            Event::End(Tag::CodeBlock(_)) => {
339                if let Some(node) = current_node.take() {
340                    if !block_quote_stack.is_empty() {
341                        let last_idx = block_quote_stack.len() - 1;
342                        block_quote_stack[last_idx].push(node);
343                    } else if !component_stack.is_empty() {
344                        let last_idx = component_stack.len() - 1;
345                        component_stack[last_idx].2.push(node);
346                    } else {
347                        nodes.push(node);
348                    }
349                }
350            }
351            Event::Start(Tag::List(first_item_number)) => {
352                list_stack.push((first_item_number.is_some(), Vec::new()));
353            }
354            Event::End(Tag::List(_)) => {
355                if let Some((ordered, items)) = list_stack.pop() {
356                    let node = Node::List { ordered, items };
357
358                    if !block_quote_stack.is_empty() {
359                        let last_idx = block_quote_stack.len() - 1;
360                        block_quote_stack[last_idx].push(node);
361                    } else if !list_stack.is_empty() {
362                        let last_list_idx = list_stack.len() - 1;
363                        if let Some(last_item) = list_stack[last_list_idx].1.last_mut() {
364                            last_item.push(node);
365                        }
366                    } else if !component_stack.is_empty() {
367                        let last_idx = component_stack.len() - 1;
368                        component_stack[last_idx].2.push(node);
369                    } else {
370                        nodes.push(node);
371                    }
372                }
373            }
374            Event::Start(Tag::Item) => {
375                if !list_stack.is_empty() {
376                    let last_idx = list_stack.len() - 1;
377                    list_stack[last_idx].1.push(Vec::new());
378                }
379            }
380            Event::End(Tag::Item) => {
381                // Handled in the list processing
382            }
383            Event::Text(text) => {
384                if let Some(Node::CodeBlock {
385                    ref mut content, ..
386                }) = current_node
387                {
388                    content.push_str(&text);
389                } else {
390                    current_inline_nodes.push(InlineNode::Text(text.to_string()));
391                }
392            }
393            Event::Code(code) => {
394                current_inline_nodes.push(InlineNode::Code(code.to_string()));
395            }
396            Event::Html(html) => {
397                let html_str = html.to_string();
398
399                // Check for custom component syntax if enabled
400                if options.custom_components && html_str.trim().starts_with("::") {
401                    if html_str.trim().starts_with(":::") {
402                        // Nested component (like tab inside tabs)
403                        if let Some(component_name) = parse_component_start(&html_str) {
404                            let attributes =
405                                extract_component_attributes(&html_str).unwrap_or_default();
406
407                            if !component_stack.is_empty() {
408                                let child_component = (component_name, attributes, Vec::new());
409                                let last_idx = component_stack.len() - 1;
410                                component_stack[last_idx].2.push(Node::Component {
411                                    name: child_component.0.clone(),
412                                    attributes: child_component.1.clone(),
413                                    children: Vec::new(),
414                                });
415                                component_stack.push(child_component);
416                            }
417                        }
418                    } else if let Some(component_name) = parse_component_start(&html_str) {
419                        let attributes =
420                            extract_component_attributes(&html_str).unwrap_or_default();
421                        component_stack.push((component_name, attributes, Vec::new()));
422                    } else if html_str.trim() == "::" || html_str.trim() == ":::" {
423                        // End of component
424                        if let Some((name, attributes, children)) = component_stack.pop() {
425                            let node = Node::Component {
426                                name,
427                                attributes,
428                                children,
429                            };
430
431                            if !component_stack.is_empty() {
432                                let last_idx = component_stack.len() - 1;
433                                // Check if the last child of the parent component is already this component
434                                if let Some(Node::Component {
435                                    name: child_name,
436                                    attributes: child_attrs,
437                                    children: child_children,
438                                }) = component_stack[last_idx].2.last_mut()
439                                {
440                                    if child_name == node.name()
441                                        && *child_attrs == node.attributes()
442                                    {
443                                        // This is already a placeholder for this component - update its children
444                                        *child_children = node.children();
445                                        continue;
446                                    }
447                                }
448                                component_stack[last_idx].2.push(node);
449                            } else if !block_quote_stack.is_empty() {
450                                let last_idx = block_quote_stack.len() - 1;
451                                block_quote_stack[last_idx].push(node);
452                            } else {
453                                nodes.push(node);
454                            }
455                        }
456                    } else {
457                        nodes.push(Node::Html(html_str));
458                    }
459                } else {
460                    nodes.push(Node::Html(html_str));
461                }
462            }
463            Event::Start(Tag::Emphasis) => {
464                let mut emphasis_nodes = Vec::new();
465
466                // Extract emphasized content
467                for next_event in events.by_ref() {
468                    match next_event {
469                        Event::Text(text) => {
470                            emphasis_nodes.push(InlineNode::Text(text.to_string()));
471                        }
472                        Event::End(Tag::Emphasis) => {
473                            break;
474                        }
475                        _ => {} // Simplify other events
476                    }
477                }
478
479                current_inline_nodes.push(InlineNode::Emphasis(emphasis_nodes));
480            }
481            Event::Start(Tag::Strong) => {
482                let mut strong_nodes = Vec::new();
483
484                // Extract strong content
485                for next_event in events.by_ref() {
486                    match next_event {
487                        Event::Text(text) => {
488                            strong_nodes.push(InlineNode::Text(text.to_string()));
489                        }
490                        Event::End(Tag::Strong) => {
491                            break;
492                        }
493                        _ => {} // Simplify other events
494                    }
495                }
496
497                current_inline_nodes.push(InlineNode::Strong(strong_nodes));
498            }
499            Event::Start(Tag::Strikethrough) => {
500                let mut strikethrough_nodes = Vec::new();
501
502                // Extract strikethrough content
503                for next_event in events.by_ref() {
504                    match next_event {
505                        Event::Text(text) => {
506                            strikethrough_nodes.push(InlineNode::Text(text.to_string()));
507                        }
508                        Event::End(Tag::Strikethrough) => {
509                            break;
510                        }
511                        _ => {} // Simplify other events
512                    }
513                }
514
515                current_inline_nodes.push(InlineNode::Strikethrough(strikethrough_nodes));
516            }
517            Event::Start(Tag::Link(_link_type, url, title)) => {
518                let url_str = url.to_string();
519                let title_opt = if title.is_empty() {
520                    None
521                } else {
522                    Some(title.to_string())
523                };
524                link_stack.push((url_str, title_opt, Vec::new()));
525            }
526            Event::End(Tag::Link(_, _, _)) => {
527                if let Some((url, title, text)) = link_stack.pop() {
528                    current_inline_nodes.push(InlineNode::Link { url, title, text });
529                }
530            }
531            Event::Start(Tag::Image(_link_type, url, title)) => {
532                let url_str = url.to_string();
533                let title_opt = if title.is_empty() {
534                    None
535                } else {
536                    Some(title.to_string())
537                };
538                // Collect alt text from next text event
539                if let Some(Event::Text(alt)) = events.next() {
540                    current_inline_nodes.push(InlineNode::Image {
541                        url: url_str,
542                        title: title_opt,
543                        alt: alt.to_string(),
544                    });
545                } else {
546                    current_inline_nodes.push(InlineNode::Image {
547                        url: url_str,
548                        title: title_opt,
549                        alt: String::new(),
550                    });
551                }
552                // Skip the end tag
553                events.next();
554            }
555            Event::SoftBreak | Event::HardBreak => {
556                current_inline_nodes.push(InlineNode::LineBreak);
557            }
558            Event::Start(Tag::Table(alignments)) => {
559                table_headers = Vec::new();
560                table_rows = Vec::new();
561                table_alignments = alignments
562                    .iter()
563                    .map(|a| match a {
564                        pulldown_cmark::Alignment::None => Alignment::None,
565                        pulldown_cmark::Alignment::Left => Alignment::Left,
566                        pulldown_cmark::Alignment::Center => Alignment::Center,
567                        pulldown_cmark::Alignment::Right => Alignment::Right,
568                    })
569                    .collect();
570            }
571            Event::End(Tag::Table(_)) => {
572                let node = Node::Table {
573                    headers: table_headers.clone(),
574                    rows: table_rows.clone(),
575                    alignments: table_alignments.clone(),
576                };
577
578                if !block_quote_stack.is_empty() {
579                    let last_idx = block_quote_stack.len() - 1;
580                    block_quote_stack[last_idx].push(node);
581                } else if !component_stack.is_empty() {
582                    let last_idx = component_stack.len() - 1;
583                    component_stack[last_idx].2.push(node);
584                } else {
585                    nodes.push(node);
586                }
587
588                table_headers.clear();
589                table_rows.clear();
590                table_alignments.clear();
591            }
592            Event::Start(Tag::TableHead) => {
593                in_table_head = true;
594            }
595            Event::End(Tag::TableHead) => {
596                in_table_head = false;
597            }
598            Event::Start(Tag::TableRow) => {
599                in_table_row = true;
600                current_table_row = Vec::new();
601            }
602            Event::End(Tag::TableRow) => {
603                in_table_row = false;
604                if !current_table_row.is_empty() {
605                    if in_table_head {
606                        table_headers = current_table_row.clone();
607                    } else {
608                        table_rows.push(current_table_row.clone());
609                    }
610                    current_table_row.clear();
611                }
612            }
613            Event::Start(Tag::TableCell) => {
614                current_table_cell = Vec::new();
615            }
616            Event::End(Tag::TableCell) => {
617                if in_table_row {
618                    current_table_row.push(current_table_cell.clone());
619                    current_table_cell.clear();
620                }
621            }
622            Event::Rule => {
623                nodes.push(Node::ThematicBreak);
624            }
625            Event::FootnoteReference(_) => {
626                // Not implemented in this example
627            }
628            Event::TaskListMarker(_) => {
629                // Not implemented in this example
630            }
631            // Handle any other events that we haven't explicitly handled
632            _ => {
633                // For simplicity, we'll just ignore other events
634            }
635        }
636    }
637
638    Ok(nodes)
639}
640
/// Extract the component name from an opening marker such as `::name{...}`
/// or `:::name{...}`.
///
/// Surrounding whitespace is ignored. The name is the text between the
/// leading marker and the first `{` (or the end of the input). Returns `None`
/// when the input does not start with `::` or when the name is empty.
fn parse_component_start(html: &str) -> Option<String> {
    let trimmed = html.trim();

    // Pick the marker that prefixes the input; the longer one wins.
    let marker = if trimmed.starts_with(":::") {
        ":::"
    } else if trimmed.starts_with("::") {
        "::"
    } else {
        return None;
    };

    let after_marker = trimmed.trim_start_matches(marker);
    let name = after_marker
        .split('{')
        .next()
        .unwrap_or(after_marker)
        .trim();

    if name.is_empty() {
        return None;
    }
    Some(name.to_string())
}
663
/// Split an attribute list into whitespace-separated tokens, keeping
/// whitespace that sits inside a quoted value (`name="two words"`).
///
/// A quote only opens a quoted section when it directly follows `=`, so an
/// apostrophe inside an unquoted value does not start one.
fn split_attribute_tokens(input: &str) -> Vec<&str> {
    let mut tokens = Vec::new();
    let mut token_start: Option<usize> = None;
    let mut open_quote: Option<char> = None;
    let mut previous: Option<char> = None;

    for (idx, ch) in input.char_indices() {
        match open_quote {
            Some(quote) => {
                if ch == quote {
                    open_quote = None;
                }
            }
            None if ch.is_whitespace() => {
                if let Some(start) = token_start.take() {
                    tokens.push(&input[start..idx]);
                }
            }
            None => {
                if token_start.is_none() {
                    token_start = Some(idx);
                }
                if (ch == '"' || ch == '\'') && previous == Some('=') {
                    open_quote = Some(ch);
                }
            }
        }
        previous = Some(ch);
    }

    if let Some(start) = token_start {
        tokens.push(&input[start..]);
    }
    tokens
}

/// Extract the `name=value` attributes from the `{...}` section of a
/// component marker.
///
/// Values may be bare (`id=one`) or quoted with `"` or `'`; quoted values may
/// contain spaces. Tokens without `=` are ignored. Returns `None` when the
/// input has no `{` or no `}` after it, and an empty map for `{}`.
fn extract_component_attributes(html: &str) -> Option<HashMap<String, String>> {
    let html = html.trim();

    let body_start = html.find('{')? + 1;
    // Search for the closing brace after the opening one so that a stray `}`
    // earlier in the input cannot produce an inverted (panicking) range.
    let body_end = body_start + html[body_start..].find('}')?;
    let attrs_str = &html[body_start..body_end];

    let mut attributes = HashMap::new();
    for token in split_attribute_tokens(attrs_str) {
        if let Some(equals_pos) = token.find('=') {
            let name = token[..equals_pos].trim();
            let value = token[equals_pos + 1..]
                .trim()
                .trim_start_matches('"')
                .trim_start_matches('\'')
                .trim_end_matches('"')
                .trim_end_matches('\'');

            attributes.insert(name.to_string(), value.to_string());
        }
    }

    Some(attributes)
}