use crate::common::flat_to_nested::events_to_tree;
use crate::error::FormatError;
use crate::ir::events::Event;
use crate::ir::nodes::{InlineContent, TableCellAlignment};
use comrak::nodes::{AstNode, NodeValue, TableAlignment};
use comrak::{parse_document, Arena, ComrakOptions};
use lex_core::lex::ast::Document;
/// Parses Markdown `source` into a lex [`Document`].
///
/// The text is parsed by comrak, flattened into IR events, rebuilt into an IR
/// tree and finally converted with `crate::from_ir`.
///
/// # Errors
///
/// Returns a [`FormatError`] when event collection fails, or a
/// `FormatError::ParseError` when the event stream cannot be nested into a tree.
pub fn parse_from_markdown(source: &str) -> Result<Document, FormatError> {
    let arena = Arena::new();
    let options = default_comrak_options();
    let ast_root = parse_document(&arena, source, &options);

    let flat_events = comrak_ast_to_events(ast_root)?;

    match events_to_tree(&flat_events) {
        Ok(ir_doc) => Ok(crate::from_ir(&ir_doc)),
        Err(e) => Err(FormatError::ParseError(format!(
            "Failed to build IR tree from events: {e}"
        ))),
    }
}
/// Builds the comrak options used by this importer.
///
/// Starts from comrak's defaults and switches on the table, strikethrough,
/// autolink, tasklist and superscript extensions, plus `---` front matter.
fn default_comrak_options() -> ComrakOptions<'static> {
    let mut options = ComrakOptions::default();
    {
        let ext = &mut options.extension;
        ext.front_matter_delimiter = Some(String::from("---"));
        ext.superscript = true;
        ext.tasklist = true;
        ext.autolink = true;
        ext.strikethrough = true;
        ext.table = true;
    }
    options
}
/// Result of probing a paragraph for the `**term**: description` shape:
/// `Some((term inlines, description inlines))` when it matches, `None` otherwise.
type DefinitionPieces = Option<(Vec<InlineContent>, Vec<InlineContent>)>;
/// Flattens a comrak document tree into a linear IR event stream.
///
/// The stream is bracketed by `StartDocument` / `EndDocument`. When the very
/// first child of `root` is a level-1 heading, it is consumed and emitted as a
/// plain paragraph holding the heading's trimmed text instead of as a heading.
///
/// # Errors
///
/// Propagates any [`FormatError`] raised while converting child nodes.
fn comrak_ast_to_events<'a>(root: &'a AstNode<'a>) -> Result<Vec<Event>, FormatError> {
    let mut events = Vec::new();
    events.push(Event::StartDocument);

    let mut remaining = root.children().peekable();

    // Only a leading H1 is treated as the document title.
    let leads_with_h1 = remaining.peek().copied().map_or(false, |first| {
        matches!(&first.data.borrow().value, NodeValue::Heading(h) if h.level == 1)
    });

    if leads_with_h1 {
        if let Some(title_node) = remaining.next() {
            let mut raw_title = String::new();
            for part in title_node.children() {
                collect_text_content(part, &mut raw_title);
            }
            let title = raw_title.trim().to_string();
            events.extend([
                Event::StartParagraph,
                Event::Inline(InlineContent::Text(title)),
                Event::EndParagraph,
            ]);
        }
    }

    collect_children_with_definitions(remaining, &mut events)?;
    events.push(Event::EndDocument);
    Ok(events)
}
/// Appends the plain-text rendering of `node` and its descendants to `output`.
///
/// * `Text` contributes its string verbatim.
/// * `Code` contributes its literal. Inline code spans store their text in
///   `literal` and have no children, so without this arm code inside titles,
///   link text or image alt text would vanish.
/// * Soft and hard line breaks become a single space.
/// * Every other node contributes whatever its children contribute.
fn collect_text_content<'a>(node: &'a AstNode<'a>, output: &mut String) {
    match &node.data.borrow().value {
        NodeValue::Text(text) => output.push_str(text),
        NodeValue::Code(code) => output.push_str(&code.literal),
        NodeValue::SoftBreak | NodeValue::LineBreak => output.push(' '),
        _ => {
            for child in node.children() {
                collect_text_content(child, output);
            }
        }
    }
}
fn collect_events_from_node<'a>(
node: &'a AstNode<'a>,
events: &mut Vec<Event>,
) -> Result<(), FormatError> {
let node_data = node.data.borrow();
match &node_data.value {
NodeValue::Document => {
collect_children_with_definitions(node.children(), events)?;
}
NodeValue::Heading(heading) => {
let level = heading.level as usize;
events.push(Event::StartHeading(level));
for child in node.children() {
collect_inline_events(child, events)?;
}
}
NodeValue::Paragraph => {
events.push(Event::StartParagraph);
for child in node.children() {
collect_inline_events(child, events)?;
}
events.push(Event::EndParagraph);
}
NodeValue::List(list) => {
let ordered = matches!(list.list_type, comrak::nodes::ListType::Ordered);
let style = if ordered {
crate::ir::nodes::ListStyle::Numeric
} else {
crate::ir::nodes::ListStyle::Bullet
};
events.push(Event::StartList {
ordered,
style,
form: crate::ir::nodes::ListForm::Short,
});
for child in node.children() {
collect_events_from_node(child, events)?;
}
events.push(Event::EndList);
}
NodeValue::Item(_) => {
events.push(Event::StartListItem);
collect_children_with_definitions(node.children(), events)?;
events.push(Event::EndListItem);
}
NodeValue::CodeBlock(code_block) => {
let language = if code_block.info.is_empty() {
None
} else {
Some(code_block.info.clone())
};
events.push(Event::StartVerbatim {
language,
subject: None,
});
events.push(Event::Inline(InlineContent::Text(
code_block.literal.clone(),
)));
events.push(Event::EndVerbatim);
}
NodeValue::HtmlBlock(html) => {
if let Some((label, parameters, content)) = parse_lex_annotation(&html.literal) {
events.push(Event::StartAnnotation {
label: label.clone(),
parameters,
});
if let Some(text) = content {
events.push(Event::StartParagraph);
events.push(Event::Inline(InlineContent::Text(text)));
events.push(Event::EndParagraph);
events.push(Event::EndAnnotation { label });
}
} else if let Some(label) = parse_lex_annotation_close(&html.literal) {
events.push(Event::EndAnnotation { label });
}
}
NodeValue::FrontMatter(content) => {
let yaml_str = content.trim();
let yaml_str = yaml_str
.trim_start_matches("---")
.trim_end_matches("---")
.trim();
let mut parameters = vec![];
for line in yaml_str.lines() {
if let Some((key, value)) = line.split_once(':') {
let key = key.trim().to_string();
let value = value.trim().to_string();
let value = value.trim_matches('"').trim_matches('\'').to_string();
parameters.push((key, value));
}
}
events.push(Event::StartAnnotation {
label: "frontmatter".to_string(),
parameters,
});
events.push(Event::EndAnnotation {
label: "frontmatter".to_string(),
});
}
NodeValue::ThematicBreak => {
}
NodeValue::BlockQuote => {
for child in node.children() {
collect_events_from_node(child, events)?;
}
}
NodeValue::Table(_) => {
events.push(Event::StartTable {
caption: None,
fullwidth: false,
});
for child in node.children() {
collect_events_from_node(child, events)?;
}
events.push(Event::EndTable);
}
NodeValue::TableRow(header) => {
events.push(Event::StartTableRow { header: *header });
for child in node.children() {
collect_events_from_node(child, events)?;
}
events.push(Event::EndTableRow);
}
NodeValue::TableCell => {
let (header, align) = get_table_cell_info(node);
events.push(Event::StartTableCell {
header,
align,
colspan: 1,
rowspan: 1,
});
events.push(Event::StartParagraph);
for child in node.children() {
collect_inline_events(child, events)?;
}
events.push(Event::EndParagraph);
events.push(Event::EndTableCell);
}
_ => {
}
}
Ok(())
}
fn get_table_cell_info<'a>(node: &'a AstNode<'a>) -> (bool, TableCellAlignment) {
let parent = match node.parent() {
Some(p) => p,
None => return (false, TableCellAlignment::None),
};
let is_header = if let NodeValue::TableRow(header) = &parent.data.borrow().value {
*header
} else {
false
};
let mut col_index = 0;
let mut curr = node.previous_sibling();
while let Some(sibling) = curr {
col_index += 1;
curr = sibling.previous_sibling();
}
let grandparent = match parent.parent() {
Some(p) => p,
None => return (is_header, TableCellAlignment::None),
};
let align = if let NodeValue::Table(table) = &grandparent.data.borrow().value {
if col_index < table.alignments.len() {
match table.alignments[col_index] {
TableAlignment::Left => TableCellAlignment::Left,
TableAlignment::Right => TableCellAlignment::Right,
TableAlignment::Center => TableCellAlignment::Center,
TableAlignment::None => TableCellAlignment::None,
}
} else {
TableCellAlignment::None
}
} else {
TableCellAlignment::None
};
(is_header, align)
}
/// Converts one inline comrak node into `Event::Inline` events appended to `events`.
///
/// Bold and italic spans become a single event whose nested content is
/// gathered via `collect_inline_content`. Links and images carry the
/// flattened text of their children as link text / alt text. Soft and hard
/// breaks become a single-space text event. Strikethrough and superscript
/// spans have no dedicated inline representation here, so their children are
/// emitted in place to keep the text. Other inline kinds produce no events.
///
/// # Errors
///
/// Propagates any [`FormatError`] from nested inline conversion.
fn collect_inline_events<'a>(
    node: &'a AstNode<'a>,
    events: &mut Vec<Event>,
) -> Result<(), FormatError> {
    let node_data = node.data.borrow();
    match &node_data.value {
        NodeValue::Text(text) => {
            events.push(Event::Inline(InlineContent::Text(text.clone())));
        }
        NodeValue::Strong => {
            let mut children = vec![];
            for child in node.children() {
                collect_inline_content(child, &mut children)?;
            }
            events.push(Event::Inline(InlineContent::Bold(children)));
        }
        NodeValue::Emph => {
            let mut children = vec![];
            for child in node.children() {
                collect_inline_content(child, &mut children)?;
            }
            events.push(Event::Inline(InlineContent::Italic(children)));
        }
        NodeValue::Code(code) => {
            events.push(Event::Inline(InlineContent::Code(code.literal.clone())));
        }
        NodeValue::Link(link) => {
            let text = collect_text_from_children(node);
            events.push(Event::Inline(InlineContent::Link {
                text,
                href: link.url.clone(),
            }));
        }
        NodeValue::SoftBreak | NodeValue::LineBreak => {
            events.push(Event::Inline(InlineContent::Text(" ".to_string())));
        }
        NodeValue::Image(link) => {
            let alt = collect_text_from_children(node);
            events.push(Event::Inline(InlineContent::Image(
                crate::ir::nodes::Image {
                    src: link.url.clone(),
                    alt,
                    // An empty title string means no title was given.
                    title: if link.title.is_empty() {
                        None
                    } else {
                        Some(link.title.clone())
                    },
                },
            )));
        }
        // Both extensions are enabled in `default_comrak_options`. Dropping
        // these wrapper nodes would silently delete the text they contain, so
        // the wrapper is discarded and the children are emitted directly.
        NodeValue::Strikethrough | NodeValue::Superscript => {
            for child in node.children() {
                collect_inline_events(child, events)?;
            }
        }
        _ => {
            // Remaining inline kinds are ignored.
        }
    }
    Ok(())
}
/// Concatenates the plain text of every child of `node`, in order.
fn collect_text_from_children<'a>(node: &'a AstNode<'a>) -> String {
    node.children().fold(String::new(), |mut gathered, child| {
        collect_text_content(child, &mut gathered);
        gathered
    })
}
fn collect_inline_content<'a>(
node: &'a AstNode<'a>,
content: &mut Vec<InlineContent>,
) -> Result<(), FormatError> {
let node_data = node.data.borrow();
match &node_data.value {
NodeValue::Text(text) => {
content.push(InlineContent::Text(text.clone()));
}
NodeValue::Strong => {
let mut children = vec![];
for child in node.children() {
collect_inline_content(child, &mut children)?;
}
content.push(InlineContent::Bold(children));
}
NodeValue::Emph => {
let mut children = vec![];
for child in node.children() {
collect_inline_content(child, &mut children)?;
}
content.push(InlineContent::Italic(children));
}
NodeValue::Code(code) => {
content.push(InlineContent::Code(code.literal.clone()));
}
NodeValue::Link(link) => {
let text = collect_text_from_children(node);
content.push(InlineContent::Link {
text,
href: link.url.clone(),
});
}
NodeValue::SoftBreak | NodeValue::LineBreak => {
content.push(InlineContent::Text(" ".to_string()));
}
_ => {}
}
Ok(())
}
/// Reports whether `node` is a heading of any level.
fn is_heading_node(node: &AstNode<'_>) -> bool {
    let data = node.data.borrow();
    match &data.value {
        NodeValue::Heading(_) => true,
        _ => false,
    }
}
/// Checks whether `node` is a paragraph of the form `**term**: description`.
///
/// The paragraph must start with a bold span, and the next inline must be
/// text whose first non-whitespace character is `:`. On a match, returns the
/// bold span's inlines as the term and everything after the colon as the
/// description (possibly empty). After the colon, only text, bold, italic,
/// code, link and line-break nodes contribute to the description; other node
/// kinds are skipped. Any other shape yields `Ok(None)`.
///
/// # Errors
///
/// Propagates any [`FormatError`] from inline conversion.
fn try_parse_definition_term<'a>(node: &'a AstNode<'a>) -> Result<DefinitionPieces, FormatError> {
    let is_paragraph = matches!(node.data.borrow().value, NodeValue::Paragraph);
    if !is_paragraph {
        return Ok(None);
    }

    let mut rest = node.children();
    let term_node = match rest.next() {
        Some(candidate) if matches!(candidate.data.borrow().value, NodeValue::Strong) => candidate,
        _ => return Ok(None),
    };

    let mut term = Vec::new();
    term_node
        .children()
        .try_for_each(|part| collect_inline_content(part, &mut term))?;

    let mut description = Vec::new();
    let mut colon_found = false;

    for sibling in rest {
        let data = sibling.data.borrow();
        match &data.value {
            // Text after the colon is description content.
            NodeValue::Text(text) if colon_found => {
                if !text.is_empty() {
                    description.push(InlineContent::Text(text.clone()));
                }
            }
            // The first text after the term must begin with the colon.
            NodeValue::Text(text) => match text.trim_start().strip_prefix(':') {
                Some(after_colon) => {
                    colon_found = true;
                    let after_colon = after_colon.trim_start();
                    if !after_colon.is_empty() {
                        description.push(InlineContent::Text(after_colon.to_string()));
                    }
                }
                None => return Ok(None),
            },
            // Anything other than text before the colon disqualifies the paragraph.
            _ if !colon_found => return Ok(None),
            NodeValue::Strong
            | NodeValue::Emph
            | NodeValue::Code(_)
            | NodeValue::Link(_)
            | NodeValue::SoftBreak
            | NodeValue::LineBreak => {
                collect_inline_content(sibling, &mut description)?;
            }
            _ => {}
        }
    }

    if colon_found {
        Ok(Some((term, description)))
    } else {
        Ok(None)
    }
}
/// Emits events for a run of sibling nodes, grouping definition-style
/// paragraphs with the blocks that follow them.
///
/// A node recognised by `try_parse_definition_term` opens a definition: its
/// term and inline description are emitted, then every following sibling is
/// absorbed into the description until a heading or another definition term
/// is reached. All other nodes are converted with `collect_events_from_node`.
///
/// # Errors
///
/// Propagates any [`FormatError`] from node or inline conversion.
fn collect_children_with_definitions<'a, I>(
    children: I,
    events: &mut Vec<Event>,
) -> Result<(), FormatError>
where
    I: Iterator<Item = &'a AstNode<'a>>,
{
    let mut pending = children.peekable();

    while let Some(current) = pending.next() {
        let (term, description) = match try_parse_definition_term(current)? {
            Some(pieces) => pieces,
            None => {
                collect_events_from_node(current, events)?;
                continue;
            }
        };

        events.push(Event::StartDefinition);

        events.push(Event::StartDefinitionTerm);
        events.extend(term.into_iter().map(Event::Inline));
        events.push(Event::EndDefinitionTerm);

        events.push(Event::StartDefinitionDescription);
        if !description.is_empty() {
            events.push(Event::StartParagraph);
            events.extend(description.into_iter().map(Event::Inline));
            events.push(Event::EndParagraph);
        }

        // Absorb trailing blocks until a heading or the next definition term.
        loop {
            let upcoming = match pending.peek().copied() {
                Some(upcoming) => upcoming,
                None => break,
            };
            if is_heading_node(upcoming) || try_parse_definition_term(upcoming)?.is_some() {
                break;
            }
            let next = pending.next().expect("peek yielded a node");
            collect_events_from_node(next, events)?;
        }

        events.push(Event::EndDefinitionDescription);
        events.push(Event::EndDefinition);
    }

    Ok(())
}
/// Parses an opening lex annotation comment of the form
/// `<!-- lex:LABEL key=value ... -->`, optionally followed by body lines
/// before the closing `-->`.
///
/// Returns `(label, parameters, body)`. The first line after `lex:` is split
/// on whitespace: the first token is the label and every later token
/// containing `=` becomes a `(key, value)` pair split at the first `=`;
/// tokens without `=` are ignored. Everything after the first newline, once
/// trimmed, is the body. Returns `None` when the text is not such a comment
/// or carries no label.
#[allow(clippy::type_complexity)]
fn parse_lex_annotation(html: &str) -> Option<(String, Vec<(String, String)>, Option<String>)> {
    let inner = html
        .trim()
        .strip_prefix("<!-- lex:")?
        .strip_suffix("-->")?
        .trim();

    let (header, body) = match inner.split_once('\n') {
        Some((first_line, remainder)) => (first_line, Some(remainder.trim().to_string())),
        None => (inner, None),
    };

    let mut tokens = header.split_whitespace();
    let label = tokens.next()?.to_string();
    let parameters = tokens
        .filter_map(|token| token.split_once('='))
        .map(|(key, value)| (key.to_string(), value.to_string()))
        .collect();

    Some((label, parameters, body))
}
/// Parses a closing lex annotation comment.
///
/// `<!-- /lex:LABEL -->` yields `Some(LABEL)` (trimmed), the bare
/// `<!-- /lex -->` yields `Some("")`, and anything else yields `None`.
fn parse_lex_annotation_close(html: &str) -> Option<String> {
    let trimmed = html.trim();
    let labelled = trimmed
        .strip_prefix("<!-- /lex:")
        .and_then(|rest| rest.strip_suffix("-->"));

    match labelled {
        Some(label) => Some(label.trim().to_string()),
        None if trimmed == "<!-- /lex -->" => Some(String::new()),
        None => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use lex_core::lex::ast::AstNode;

    /// Parses `md` and asserts the resulting document root has children.
    fn assert_produces_content(md: &str) {
        let doc = parse_from_markdown(md).unwrap();
        assert!(!doc.root.children.is_empty());
    }

    #[test]
    fn test_simple_paragraph() {
        assert_produces_content("This is a simple paragraph.\n");
    }

    #[test]
    fn test_heading_to_session() {
        assert_produces_content("# Introduction\n\nSome content.\n");
    }

    #[test]
    fn test_code_block_to_verbatim() {
        assert_produces_content("```rust\nfn main() {}\n```\n");
    }

    /// A pipe table should surface as a verbatim block whose text contains a
    /// `| --- |` separator row.
    #[test]
    fn test_table_parsing() {
        use lex_core::lex::ast::ContentItem;

        let md = "|A|B|\n|-|-|\n|1|2|\n";
        let doc = parse_from_markdown(md).unwrap();

        // Diagnostic output, visible with `--nocapture`.
        println!("Children count: {}", doc.root.children.len());
        for child in &doc.root.children {
            println!("Child type: {}", child.node_type());
        }

        let has_table = doc.root.children.iter().any(|item| match item {
            ContentItem::VerbatimBlock(block) => {
                let mut text = String::new();
                let lines = block.children.iter().filter_map(|entry| match entry {
                    ContentItem::VerbatimLine(line) => Some(line),
                    _ => None,
                });
                for line in lines {
                    text.push_str(line.content.as_string());
                    text.push('\n');
                }
                text.contains("| --- |")
            }
            _ => false,
        });
        assert!(has_table, "Document should contain an aligned table");
    }
}