use crate::format::DocumentFormat;
use crate::ir::{
DocumentIR, Element, Heading, InlineContent, List, ListItem, Metadata, Paragraph, Section,
Table, TableCell, TableRow, TextSpan,
};
impl DocumentIR {
    /// Builds a document IR from Markdown text.
    ///
    /// `markdown` is handed to the line-based `MarkdownParser`, whose sections
    /// become the document body. `format` is stored in the metadata as given.
    /// The metadata title is always left as `None`; level-1 headings end up as
    /// section titles instead.
    pub fn from_markdown(markdown: &str, format: DocumentFormat) -> Self {
        let sections = MarkdownParser::new(markdown).parse_sections();
        let metadata = Metadata {
            format,
            title: None,
            ..Default::default()
        };
        Self { metadata, sections }
    }
}
/// Line-oriented cursor over Markdown source text.
struct MarkdownParser<'a> {
    /// The input split into lines; every entry borrows from the source string.
    lines: Vec<&'a str>,
    /// Index into `lines` of the next line to be read.
    pos: usize,
}
impl<'a> MarkdownParser<'a> {
fn new(src: &'a str) -> Self {
Self {
lines: src.lines().collect(),
pos: 0,
}
}
fn peek(&self) -> Option<&'a str> {
self.lines.get(self.pos).copied()
}
fn advance(&mut self) -> Option<&'a str> {
let line = self.lines.get(self.pos).copied();
self.pos += 1;
line
}
fn parse_sections(&mut self) -> Vec<Section> {
let mut sections: Vec<Section> = Vec::new();
let mut current = Section {
title: None,
elements: Vec::new(),
..Default::default()
};
while self.pos < self.lines.len() {
let line = match self.peek() {
Some(l) => l,
None => break,
};
if line.trim().is_empty() {
self.advance();
continue;
}
if is_thematic_break(line) {
self.advance();
if !current.elements.is_empty() || current.title.is_some() {
sections.push(current);
current = Section {
title: None,
elements: Vec::new(),
..Default::default()
};
}
continue;
}
if let Some((level, text)) = parse_atx_heading(line) {
self.advance();
if level == 1 {
if !current.elements.is_empty() || current.title.is_some() {
sections.push(current);
}
current = Section {
title: Some(text.clone()),
elements: Vec::new(),
..Default::default()
};
} else {
current.elements.push(Element::Heading(Heading {
level,
content: parse_inline(&text),
}));
}
continue;
}
if line.trim_start().starts_with('|') {
if let Some(table) = self.parse_table() {
current.elements.push(Element::Table(table));
continue;
}
}
if is_unordered_list_marker(line) {
let list = self.parse_list(false);
current.elements.push(Element::List(list));
continue;
}
if is_ordered_list_marker(line) {
let list = self.parse_list(true);
current.elements.push(Element::List(list));
continue;
}
let para = self.parse_paragraph();
if !para.content.is_empty() {
current.elements.push(Element::Paragraph(para));
}
}
if !current.elements.is_empty() || current.title.is_some() {
sections.push(current);
}
if sections.is_empty() {
sections.push(Section {
title: None,
elements: Vec::new(),
..Default::default()
});
}
sections
}
fn parse_paragraph(&mut self) -> Paragraph {
let mut lines: Vec<&str> = Vec::new();
loop {
match self.peek() {
None => break,
Some(line) => {
if line.trim().is_empty()
|| parse_atx_heading(line).is_some()
|| is_thematic_break(line)
|| line.trim_start().starts_with('|')
|| is_unordered_list_marker(line)
|| is_ordered_list_marker(line)
{
break;
}
lines.push(line);
self.advance();
},
}
}
let text = lines.join(" ");
Paragraph {
content: parse_inline(&text),
..Default::default()
}
}
fn parse_table(&mut self) -> Option<Table> {
let mut raw: Vec<&'a str> = Vec::new();
while let Some(line) = self.peek() {
if line.trim_start().starts_with('|') {
raw.push(line);
self.advance();
} else {
break;
}
}
if raw.is_empty() {
return None;
}
let data_rows: Vec<&str> = raw
.iter()
.copied()
.filter(|line| !is_table_separator_row(line))
.collect();
if data_rows.is_empty() {
return None;
}
let mut rows: Vec<TableRow> = Vec::new();
for (i, row_line) in data_rows.iter().enumerate() {
let cells = split_pipe_row(row_line)
.into_iter()
.map(|cell_text| TableCell {
content: vec![Element::Paragraph(Paragraph {
content: parse_inline(cell_text.trim()),
..Default::default()
})],
col_span: 1,
row_span: 1,
..Default::default()
})
.collect();
rows.push(TableRow {
cells,
is_header: i == 0,
..Default::default()
});
}
Some(Table {
rows,
..Default::default()
})
}
fn parse_list(&mut self, ordered: bool) -> List {
let mut items: Vec<ListItem> = Vec::new();
loop {
match self.peek() {
None => break,
Some(line) => {
if ordered && !is_ordered_list_marker(line) {
break;
}
if !ordered && !is_unordered_list_marker(line) {
break;
}
self.advance();
let content_str = strip_list_marker(line);
items.push(ListItem {
content: vec![Element::Paragraph(Paragraph {
content: parse_inline(content_str),
..Default::default()
})],
nested: None,
});
},
}
}
List {
ordered,
items,
..Default::default()
}
}
}
fn parse_inline(text: &str) -> Vec<InlineContent> {
let mut out: Vec<InlineContent> = Vec::new();
let bytes = text.as_bytes();
let len = text.len();
let mut plain_start = 0usize;
macro_rules! flush_plain {
($end:expr) => {
if plain_start < $end {
let t = &text[plain_start..$end];
if !t.is_empty() {
out.push(InlineContent::Text(TextSpan::plain(t)));
}
}
};
}
let mut i = 0usize;
while i < len {
if i + 1 < len
&& ((bytes[i] == b'*' && bytes[i + 1] == b'*')
|| (bytes[i] == b'_' && bytes[i + 1] == b'_'))
{
let marker = &text[i..i + 2];
if let Some(end) = text[i + 2..].find(marker) {
flush_plain!(i);
let inner = &text[i + 2..i + 2 + end];
out.push(InlineContent::Text(TextSpan {
text: inner.to_string(),
bold: true,
..Default::default()
}));
i += 2 + end + 2;
plain_start = i;
continue;
}
}
if (bytes[i] == b'*' || bytes[i] == b'_') && i + 1 < len && bytes[i + 1] != bytes[i] {
let marker = &text[i..i + 1];
if let Some(end) = text[i + 1..].find(marker) {
flush_plain!(i);
let inner = &text[i + 1..i + 1 + end];
out.push(InlineContent::Text(TextSpan {
text: inner.to_string(),
italic: true,
..Default::default()
}));
i += 1 + end + 1;
plain_start = i;
continue;
}
}
if i + 1 < len && bytes[i] == b'~' && bytes[i + 1] == b'~' {
if let Some(end) = text[i + 2..].find("~~") {
flush_plain!(i);
let inner = &text[i + 2..i + 2 + end];
out.push(InlineContent::Text(TextSpan {
text: inner.to_string(),
strikethrough: true,
..Default::default()
}));
i += 2 + end + 2;
plain_start = i;
continue;
}
}
if bytes[i] == b'`' {
if let Some(end) = text[i + 1..].find('`') {
flush_plain!(i);
let inner = &text[i + 1..i + 1 + end];
out.push(InlineContent::Text(TextSpan::plain(inner)));
i += 1 + end + 1;
plain_start = i;
continue;
}
}
if bytes[i] == b'[' {
if let Some(bracket_end) = text[i + 1..].find(']') {
let after_bracket = i + 1 + bracket_end + 1;
if after_bracket < len && bytes[after_bracket] == b'(' {
if let Some(paren_end) = text[after_bracket + 1..].find(')') {
flush_plain!(i);
let link_text = &text[i + 1..i + 1 + bracket_end];
let url = &text[after_bracket + 1..after_bracket + 1 + paren_end];
out.push(InlineContent::Text(TextSpan {
text: link_text.to_string(),
hyperlink: Some(url.to_string()),
..Default::default()
}));
i = after_bracket + 1 + paren_end + 1;
plain_start = i;
continue;
}
}
}
}
i += text[i..].chars().next().map(|c| c.len_utf8()).unwrap_or(1);
}
flush_plain!(len);
out
}
/// Parses an ATX heading (`# Title`) and returns its level and text.
///
/// Leading whitespace is ignored. One to six `#` characters must be followed by
/// a space, a tab, or the end of the line; anything else returns `None`.
///
/// An optional closing run of `#` is removed only when it is separated from the
/// text by whitespace or makes up the whole content, so `## Title ##` yields
/// `"Title"` while `# C#` keeps its trailing `#`.
fn parse_atx_heading(line: &str) -> Option<(u8, String)> {
    let trimmed = line.trim_start();
    let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
    if hashes == 0 || hashes > 6 {
        return None;
    }

    let rest = &trimmed[hashes..];
    if !(rest.is_empty() || rest.starts_with(' ') || rest.starts_with('\t')) {
        return None;
    }

    let body = rest.trim();
    let without_closer = body.trim_end_matches('#');
    let is_closing_sequence =
        without_closer.is_empty() || without_closer.ends_with(|c: char| c == ' ' || c == '\t');
    let text = if is_closing_sequence {
        without_closer.trim_end()
    } else {
        // The trailing `#` run is glued to the text, so it is heading content.
        body
    };

    // `hashes` is in 1..=6 here, so the cast cannot truncate.
    Some((hashes as u8, text.to_string()))
}
/// Reports whether `line` is a thematic break.
///
/// After trimming, the line must start with `-`, `*` or `_`, contain nothing
/// but that character and spaces, and use the character at least three times.
fn is_thematic_break(line: &str) -> bool {
    let body = line.trim();
    let marker = match body.chars().next() {
        Some(c @ ('-' | '*' | '_')) => c,
        _ => return false,
    };

    let mut marker_count = 0usize;
    for c in body.chars() {
        if c == marker {
            marker_count += 1;
        } else if c != ' ' {
            return false;
        }
    }
    // Fewer than three markers also covers lines shorter than three bytes.
    marker_count >= 3
}
/// Reports whether `line` is a table delimiter row such as `|---|:-:|`.
///
/// Outer whitespace and outer pipes are removed first. Every remaining cell
/// must then consist of one or more `-`, optionally wrapped in `:` characters.
fn is_table_separator_row(line: &str) -> bool {
    let inner = line.trim().trim_matches('|');
    for cell in inner.split('|') {
        let dashes = cell.trim().trim_matches(':');
        if dashes.is_empty() || dashes.bytes().any(|b| b != b'-') {
            return false;
        }
    }
    true
}
/// Splits one pipe-table row into its raw, untrimmed cell texts.
///
/// Surrounding whitespace is removed, then at most one leading and one trailing
/// `|` are stripped as the row's outer borders. Further pipes at either end
/// delimit empty cells and are kept, so `| a ||` yields two cells rather than
/// one. Escaped pipes (`\|`) are not treated specially.
fn split_pipe_row(line: &str) -> Vec<&str> {
    let row = line.trim();
    let row = row.strip_prefix('|').unwrap_or(row);
    let row = row.strip_suffix('|').unwrap_or(row);
    row.split('|').collect()
}
/// Reports whether `line` opens a bullet item.
///
/// Ignoring leading whitespace, the line must start with `-`, `*` or `+`
/// followed by a space. Lines that form a thematic break (for example `- - -`)
/// are excluded.
fn is_unordered_list_marker(line: &str) -> bool {
    if is_thematic_break(line) {
        return false;
    }
    let mut chars = line.trim_start().chars();
    matches!(
        (chars.next(), chars.next()),
        (Some('-' | '*' | '+'), Some(' '))
    )
}
/// Reports whether `line` opens a numbered item.
///
/// Ignoring leading whitespace, the line must start with one or more ASCII
/// digits followed by `". "` or `") "`.
fn is_ordered_list_marker(line: &str) -> bool {
    let body = line.trim_start();
    let after_digits = body.trim_start_matches(|c: char| c.is_ascii_digit());
    let has_digits = after_digits.len() < body.len();
    has_digits && matches!(after_digits.as_bytes(), [b'.' | b')', b' ', ..])
}
/// Returns the item text of a list line with its marker removed.
///
/// Leading whitespace is dropped, then either a bullet (`- `, `* `, `+ `) or a
/// numeric marker (ASCII digits followed by `". "` or `") "`) is stripped along
/// with any further whitespace after it. A line that carries no such marker is
/// returned with only its leading whitespace trimmed, rather than being sliced
/// at a fixed offset, which could panic on short or non-ASCII input.
fn strip_list_marker(line: &str) -> &str {
    let t = line.trim_start();

    for bullet in ["- ", "* ", "+ "] {
        if let Some(rest) = t.strip_prefix(bullet) {
            return rest.trim_start();
        }
    }

    let after_digits = t.trim_start_matches(|c: char| c.is_ascii_digit());
    if after_digits.len() < t.len() {
        let rest = after_digits
            .strip_prefix(". ")
            .or_else(|| after_digits.strip_prefix(") "));
        if let Some(rest) = rest {
            return rest.trim_start();
        }
    }

    t
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::format::DocumentFormat;

    /// Parses `md` with the DOCX format tag used by every test in this module.
    fn parse(md: &str) -> DocumentIR {
        DocumentIR::from_markdown(md, DocumentFormat::Docx)
    }

    /// Returns the first element of the first section.
    fn first_element(ir: &DocumentIR) -> &Element {
        &ir.sections[0].elements[0]
    }

    /// Returns the first element as a list, panicking on any other kind.
    fn first_list(ir: &DocumentIR) -> &List {
        match first_element(ir) {
            Element::List(l) => l,
            other => panic!("expected List, got {other:?}"),
        }
    }

    #[test]
    fn parse_heading_paragraph() {
        let ir = parse("# Hello\n\nSome text here.\n");
        assert_eq!(ir.sections.len(), 1);
        let section = &ir.sections[0];
        assert_eq!(section.title.as_deref(), Some("Hello"));
        assert!(matches!(section.elements[0], Element::Paragraph(_)));
    }

    #[test]
    fn parse_page_break_into_sections() {
        let ir = parse("# Page 1\n\nText one.\n\n---\n\n# Page 2\n\nText two.\n");
        let titles: Vec<Option<&str>> = ir.sections.iter().map(|s| s.title.as_deref()).collect();
        assert_eq!(titles, vec![Some("Page 1"), Some("Page 2")]);
    }

    #[test]
    fn parse_unordered_list() {
        let ir = parse("- apple\n- banana\n- cherry\n");
        let list = first_list(&ir);
        assert!(!list.ordered);
        assert_eq!(list.items.len(), 3);
    }

    #[test]
    fn parse_ordered_list() {
        let ir = parse("1. first\n2. second\n");
        assert!(first_list(&ir).ordered);
    }

    #[test]
    fn parse_pipe_table() {
        let ir = parse("| Name | Age |\n|------|-----|\n| Alice | 30 |\n| Bob | 25 |\n");
        let table = match first_element(&ir) {
            Element::Table(t) => t,
            other => panic!("expected Table, got {other:?}"),
        };
        // The separator row is dropped, leaving the header plus two data rows.
        assert_eq!(table.rows.len(), 3);
        assert!(table.rows[0].is_header);
    }

    #[test]
    fn parse_bold_italic_inline() {
        let ir = parse("Hello **world** and *rust*.\n");
        let para = match first_element(&ir) {
            Element::Paragraph(p) => p,
            other => panic!("expected Paragraph, got {other:?}"),
        };

        let mut saw_bold_world = false;
        let mut saw_italic_rust = false;
        for item in &para.content {
            if let InlineContent::Text(span) = item {
                saw_bold_world |= span.bold && span.text == "world";
                saw_italic_rust |= span.italic && span.text == "rust";
            }
        }
        assert!(saw_bold_world);
        assert!(saw_italic_rust);
    }

    #[test]
    fn parse_empty_markdown() {
        let ir = parse("");
        assert_eq!(ir.sections.len(), 1);
        assert!(ir.sections[0].elements.is_empty());
    }
}