pub mod shared;
pub mod cm_blockquote_parser;
pub mod cm_fenced_code_block_parser;
pub mod cm_heading_parser;
pub mod cm_html_blocks_parser;
pub mod cm_indented_code_block_parser;
pub mod cm_link_reference_parser;
pub mod cm_list_parser;
pub mod cm_paragraph_parser;
pub mod cm_thematic_break_parser;
pub mod gfm_admonitions;
pub mod gfm_footnote_definition_parser;
pub mod gfm_table_parser;
pub mod marco_headerless_table_parser;
pub mod marco_sliders_parser;
pub mod marco_tab_blocks_parser;
pub use shared::{dedent_list_item_content, to_parser_span, to_parser_span_range, GrammarSpan};
use super::ast::Document;
use crate::grammar::blocks as grammar;
use crate::parser::ast::{Node, NodeKind};
use nom::Input;
/// Kinds of container block the block parser can be nested inside.
#[derive(Debug, Clone, PartialEq)]
enum BlockContextKind {
    /// A list item; a line must be indented by at least `content_indent`
    /// to be treated as continuing the item.
    ListItem { content_indent: usize },
}

/// One open container block, as stored on the parser's context stack.
#[derive(Debug, Clone)]
struct BlockContext {
    kind: BlockContextKind,
}

impl BlockContext {
    /// Builds the context for a list item whose content starts at `content_indent`.
    pub fn new_list_item(content_indent: usize) -> Self {
        let kind = BlockContextKind::ListItem { content_indent };
        Self { kind }
    }

    /// Returns `true` when a line indented by `indent` can continue this block.
    fn can_continue_at(&self, indent: usize) -> bool {
        match &self.kind {
            BlockContextKind::ListItem { content_indent } => *content_indent <= indent,
        }
    }
}
/// Mutable state threaded through one run of the block parser.
struct ParserState {
    /// Stack of open container blocks, innermost last.
    blocks: Vec<BlockContext>,
    /// When `false`, the block parser does not attempt Marco tab blocks.
    allow_tab_blocks: bool,
    /// When `false`, the block parser does not attempt Marco slide decks.
    allow_sliders: bool,
}

impl ParserState {
    /// Creates a state with no open blocks and every optional block kind enabled.
    fn new() -> Self {
        Self {
            blocks: Vec::new(),
            allow_tab_blocks: true,
            allow_sliders: true,
        }
    }

    /// Like [`ParserState::new`], but with tab-block parsing set explicitly.
    fn new_with_tab_blocks(allow_tab_blocks: bool) -> Self {
        Self {
            allow_tab_blocks,
            ..Self::new()
        }
    }

    /// Like [`ParserState::new`], but with slide-deck parsing set explicitly.
    fn new_with_sliders(allow_sliders: bool) -> Self {
        Self {
            allow_sliders,
            ..Self::new()
        }
    }

    /// Opens a new innermost block.
    pub fn push_block(&mut self, context: BlockContext) {
        self.blocks.push(context);
    }

    /// Closes and returns the innermost block, or `None` when nothing is open.
    fn pop_block(&mut self) -> Option<BlockContext> {
        self.blocks.pop()
    }

    /// Returns `true` when the innermost open block accepts a line indented by
    /// `indent`; always `false` when no block is open.
    fn can_continue_at(&self, indent: usize) -> bool {
        matches!(self.blocks.last(), Some(innermost) if innermost.can_continue_at(indent))
    }

    /// Pops open blocks, innermost first, until one can continue at `indent`
    /// (or none remain) and returns how many were closed.
    fn close_blocks_until_indent(&mut self, indent: usize) -> usize {
        let open_before = self.blocks.len();
        while matches!(self.blocks.last(), Some(innermost) if !innermost.can_continue_at(indent)) {
            self.blocks.pop();
        }
        open_before - self.blocks.len()
    }
}
/// Parses `input` into a [`Document`] of block-level nodes.
///
/// Runs the internal block parser at recursion depth 0 with a fresh
/// [`ParserState`].
///
/// # Errors
///
/// Returns whatever error the internal block parser reports.
pub fn parse_blocks(input: &str) -> Result<Document, Box<dyn std::error::Error>> {
    let root_depth = 0;
    let mut root_state = ParserState::new();
    parse_blocks_internal(input, root_depth, &mut root_state)
}
fn parse_blocks_internal(
input: &str,
depth: usize,
state: &mut ParserState,
) -> Result<Document, Box<dyn std::error::Error>> {
const MAX_DEPTH: usize = 100;
if depth > MAX_DEPTH {
log::warn!("Maximum recursion depth reached in block parser");
return Ok(Document::new());
}
log::debug!(
"Block parser input: {} bytes at depth {}, state depth: {}",
input.len(),
depth,
state.blocks.len()
);
let mut nodes = Vec::new();
let mut document = Document::new(); let mut remaining = GrammarSpan::new(input);
let max_iterations = input.lines().count().saturating_mul(8).max(1_000);
let mut iteration_count = 0;
let mut last_offset = 0;
while !remaining.fragment().is_empty() {
iteration_count += 1;
if iteration_count > max_iterations {
log::error!(
"Block parser exceeded iteration limit ({}) at depth {}",
max_iterations,
depth
);
break;
}
let current_offset = remaining.location_offset();
if current_offset == last_offset && iteration_count > 1 {
log::error!(
"Block parser not making progress at offset {}, depth {}",
current_offset,
depth
);
use nom::bytes::complete::take;
let skip_len = remaining
.fragment()
.chars()
.next()
.map(|c| c.len_utf8())
.unwrap_or(1);
if let Ok((rest, _)) =
take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
{
remaining = rest;
last_offset = remaining.location_offset();
continue;
}
break;
}
last_offset = current_offset;
let first_line_end = remaining
.fragment()
.find('\n')
.unwrap_or(remaining.fragment().len());
let first_line = &remaining.fragment()[..first_line_end];
if first_line.chars().all(|c| c == ' ' || c == '\t') {
let peek_offset = if first_line_end < remaining.fragment().len() {
first_line_end + 1
} else {
first_line_end
};
let mut next_nonblank_indent: Option<usize> = None;
let rest_of_input = &remaining.fragment()[peek_offset..];
for peek_line in rest_of_input.lines() {
if !peek_line.trim().is_empty() {
let mut indent = 0;
for ch in peek_line.chars() {
if ch == ' ' {
indent += 1;
} else if ch == '\t' {
indent += 4 - (indent % 4); } else {
break;
}
}
next_nonblank_indent = Some(indent);
break;
}
}
let should_continue = if let Some(next_indent) = next_nonblank_indent {
state.can_continue_at(next_indent)
} else {
false
};
if should_continue {
log::debug!(
"Blank line: continuing context at indent {:?}",
next_nonblank_indent
);
use nom::bytes::complete::take;
let skip_len = if first_line_end < remaining.fragment().len() {
first_line_end + 1 } else {
first_line_end
};
if let Ok((new_remaining, _)) =
take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
{
remaining = new_remaining;
continue;
} else {
break;
}
} else {
if let Some(next_indent) = next_nonblank_indent {
let closed = state.close_blocks_until_indent(next_indent);
log::debug!(
"Blank line: closed {} blocks due to indent {}",
closed,
next_indent
);
} else {
log::debug!("Blank line: end of input, closing all blocks");
while state.pop_block().is_some() {}
}
use nom::bytes::complete::take;
let skip_len = if first_line_end < remaining.fragment().len() {
first_line_end + 1
} else {
first_line_end
};
if let Ok((new_remaining, _)) =
take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
{
remaining = new_remaining;
continue;
} else {
break;
}
}
}
if let Ok((rest, content)) = grammar::html_special_tag(remaining) {
nodes.push(cm_html_blocks_parser::parse_html_block(content));
remaining = rest;
continue;
}
if let Ok((rest, content)) = grammar::html_comment(remaining) {
nodes.push(cm_html_blocks_parser::parse_html_block(content));
remaining = rest;
continue;
}
if let Ok((rest, content)) = grammar::html_processing_instruction(remaining) {
nodes.push(cm_html_blocks_parser::parse_html_block(content));
remaining = rest;
continue;
}
if let Ok((rest, content)) = grammar::html_declaration(remaining) {
nodes.push(cm_html_blocks_parser::parse_html_block(content));
remaining = rest;
continue;
}
if let Ok((rest, content)) = grammar::html_cdata(remaining) {
nodes.push(cm_html_blocks_parser::parse_html_block(content));
remaining = rest;
continue;
}
if let Ok((rest, content)) = grammar::html_block_tag(remaining) {
nodes.push(cm_html_blocks_parser::parse_html_block(content));
remaining = rest;
continue;
}
if let Ok((rest, content)) = grammar::html_complete_tag(remaining) {
nodes.push(cm_html_blocks_parser::parse_html_block(content));
remaining = rest;
continue;
} if let Ok((rest, (level, content))) = grammar::heading(remaining) {
nodes.push(cm_heading_parser::parse_atx_heading(level, content));
remaining = rest;
continue;
}
if let Ok((rest, (language, content))) = grammar::fenced_code_block(remaining) {
nodes.push(cm_fenced_code_block_parser::parse_fenced_code_block(
language, content,
));
remaining = rest;
continue;
}
if let Ok((rest, content)) = grammar::thematic_break(remaining) {
nodes.push(cm_thematic_break_parser::parse_thematic_break(content));
remaining = rest;
continue;
}
if let Ok((rest, content)) = grammar::blockquote(remaining) {
let node =
cm_blockquote_parser::parse_blockquote(content, depth, |cleaned, new_depth| {
parse_blocks_internal(cleaned, new_depth, state)
})?;
nodes.push(node);
remaining = rest;
continue;
}
if let Ok((rest, content)) = grammar::indented_code_block(remaining) {
nodes.push(cm_indented_code_block_parser::parse_indented_code_block(
content,
));
remaining = rest;
continue;
}
if let Ok((rest, items)) = grammar::list(remaining) {
let node = cm_list_parser::parse_list(
items,
depth,
parse_blocks_internal,
|content_indent| {
let mut item_state = ParserState::new();
item_state.push_block(BlockContext::new_list_item(content_indent));
item_state
},
)?;
nodes.push(node);
remaining = rest;
continue;
}
if state.allow_sliders {
let deck_start = remaining;
if let Ok((rest, deck)) = grammar::marco_slide_deck(remaining) {
let node = marco_sliders_parser::parse_marco_slide_deck(
deck,
deck_start,
rest,
depth,
|slide_body, new_depth| {
let mut slide_state = ParserState::new_with_sliders(false);
parse_blocks_internal(slide_body, new_depth, &mut slide_state)
},
)?;
nodes.push(node);
remaining = rest;
continue;
}
}
let full_start = remaining;
if let Ok((rest, (level, content))) = grammar::setext_heading(remaining) {
let full_end = rest;
nodes.push(cm_heading_parser::parse_setext_heading(
level, content, full_start, full_end,
));
remaining = rest;
continue;
}
if let Some((rest, node)) =
gfm_footnote_definition_parser::parse_footnote_definition(remaining)
{
nodes.push(node);
remaining = rest;
continue;
}
if let Ok((rest, (label, url, title))) = grammar::link_reference_definition(remaining) {
cm_link_reference_parser::parse_link_reference(&mut document, &label, url, title);
remaining = rest;
continue;
}
let headerless_table_start = remaining;
if let Ok((rest, table)) = grammar::headerless_table(remaining) {
nodes.push(marco_headerless_table_parser::parse_marco_headerless_table(
table,
headerless_table_start,
rest,
));
remaining = rest;
continue;
}
let table_start = remaining;
if let Ok((rest, table)) = grammar::gfm_table(remaining) {
nodes.push(gfm_table_parser::parse_gfm_table(table, table_start, rest));
remaining = rest;
continue;
}
if state.allow_tab_blocks {
let tab_start = remaining;
if let Ok((rest, block)) = grammar::marco_tab_block(remaining) {
let node = marco_tab_blocks_parser::parse_marco_tab_block(
block,
tab_start,
rest,
depth,
|panel, new_depth| {
let mut panel_state = ParserState::new_with_tab_blocks(false);
parse_blocks_internal(panel, new_depth, &mut panel_state)
},
)?;
nodes.push(node);
remaining = rest;
continue;
}
}
if let Some((rest, node)) = parse_extended_definition_list(remaining, depth) {
nodes.push(node);
remaining = rest;
continue;
}
if let Ok((rest, content)) = grammar::paragraph(remaining) {
nodes.push(cm_paragraph_parser::parse_paragraph(content));
remaining = rest;
continue;
}
log::warn!(
"Could not parse block at offset {}, skipping character",
remaining.location_offset()
);
use nom::bytes::complete::take;
let skip_len = remaining
.fragment()
.chars()
.next()
.map(|c| c.len_utf8())
.unwrap_or(1);
if let Ok((rest, _)) =
take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
{
remaining = rest;
} else {
break;
}
}
log::info!("Parsed {} blocks", nodes.len());
document.children = nodes;
Ok(document)
}
/// Tries to parse an extended definition list at the start of `input`.
///
/// An item is a non-blank term line immediately followed by one or more
/// definition lines. A definition line is up to three spaces, a single `:`,
/// then a space or tab. A definition body continues over lines indented by at
/// least `CONTINUATION_INDENT` columns, and over blank lines when the next
/// non-blank line is indented that far. Further items are only joined to the
/// list when separated from the previous one by at least one blank line.
///
/// Returns the unconsumed remainder and a `DefinitionList` node whose children
/// are `DefinitionTerm` / `DefinitionDescription` nodes, or `None` when `input`
/// does not start with a definition item.
fn parse_extended_definition_list<'a>(
    input: GrammarSpan<'a>,
    depth: usize,
) -> Option<(GrammarSpan<'a>, Node)> {
    /// Minimum indentation, in columns, for a line to continue a definition body.
    const CONTINUATION_INDENT: usize = 2;

    /// For the line starting at byte `start`, returns `(start, end, next)`:
    /// `end` is the index of its `\n` (or the text length) and `next` is where
    /// the following line begins.
    fn line_bounds(s: &str, start: usize) -> (usize, usize, usize) {
        let end = match s[start..].find('\n') {
            Some(offset) => start + offset,
            None => s.len(),
        };
        let next = (end + 1).min(s.len());
        (start, end, next)
    }

    /// Leading indentation of `line` in columns, expanding tabs to 4-column stops.
    fn count_indent_columns(line: &str) -> usize {
        line.chars()
            .take_while(|&ch| ch == ' ' || ch == '\t')
            .fold(0usize, |cols, ch| {
                if ch == ' ' {
                    cols + 1
                } else {
                    cols + 4 - (cols % 4)
                }
            })
    }

    /// If `line` opens a definition, returns the byte index just past the
    /// marker and its one following space/tab; otherwise `None`.
    fn def_marker_content_start(line: &str) -> Option<usize> {
        let bytes = line.as_bytes();
        // At most three leading spaces may precede the colon.
        let colon_at = bytes.iter().take(3).take_while(|&&b| b == b' ').count();
        if bytes.get(colon_at) != Some(&b':') {
            return None;
        }
        // Anything but a space or tab after the colon (including `::`) is not a marker.
        match bytes.get(colon_at + 1) {
            Some(b' ') | Some(b'\t') => Some(colon_at + 2),
            _ => None,
        }
    }

    /// Returns `true` when a non-blank term line starts at `start` and the
    /// line right after it is a definition line.
    fn can_start_item_at(text: &str, start: usize) -> bool {
        if start >= text.len() {
            return false;
        }
        let (_, term_end, def_start) = line_bounds(text, start);
        if text[start..term_end].trim().is_empty() || def_start >= text.len() {
            return false;
        }
        let (_, def_end, _) = line_bounds(text, def_start);
        def_marker_content_start(&text[def_start..def_end]).is_some()
    }

    /// Indentation of the first non-blank line at or after byte `from`, if any.
    fn next_nonblank_indent(text: &str, from: usize) -> Option<usize> {
        let mut pos = from;
        while pos < text.len() {
            let (_, end, next) = line_bounds(text, pos);
            let candidate = &text[pos..end];
            if !candidate.trim().is_empty() {
                return Some(count_indent_columns(candidate));
            }
            pos = next;
        }
        None
    }

    /// Start of the first non-blank line at or after `from` (or the text length).
    fn skip_blank_lines(text: &str, from: usize) -> usize {
        let mut pos = from;
        while pos < text.len() {
            let (_, end, next) = line_bounds(text, pos);
            if !text[pos..end].trim().is_empty() {
                break;
            }
            pos = next;
        }
        pos
    }

    let text = input.fragment();
    if text.is_empty() {
        return None;
    }

    let mut children: Vec<Node> = Vec::new();
    let mut cursor = 0usize;
    let mut parsed_any = false;

    // One iteration per item: a term followed by its definitions.
    while can_start_item_at(text, cursor) {
        let (term_start, term_end, after_term) = line_bounds(text, cursor);

        // ---- Term ----
        let term_start_span = input.take_from(term_start);
        let (term_after_span, term_taken_span) = term_start_span.take_split(term_end - term_start);
        let term_children = crate::parser::inlines::parse_inlines_from_span(term_taken_span)
            .unwrap_or_else(|e| {
                log::warn!("Failed to parse inline elements in definition term: {}", e);
                // Fall back to the raw term text as a single text node.
                vec![Node {
                    kind: NodeKind::Text(term_taken_span.fragment().to_string()),
                    span: crate::parser::shared::opt_span(term_taken_span),
                    children: Vec::new(),
                }]
            });
        children.push(Node {
            kind: NodeKind::DefinitionTerm,
            span: crate::parser::shared::opt_span_range(term_start_span, term_after_span),
            children: term_children,
        });

        // ---- Definitions: one per consecutive marker line ----
        cursor = after_term;
        while cursor < text.len() {
            let (marker_line_start, marker_line_end, body_scan_start) = line_bounds(text, cursor);
            let marker_line = &text[marker_line_start..marker_line_end];
            let content_offset = match def_marker_content_start(marker_line) {
                Some(offset) => offset,
                None => break,
            };

            // Gather the definition body: the rest of the marker line plus
            // every continuation line that follows.
            let mut body_lines: Vec<&str> = vec![&marker_line[content_offset..]];
            let mut body_end = body_scan_start;
            let mut scan = body_scan_start;
            while scan < text.len() {
                let (line_start, line_end, line_next) = line_bounds(text, scan);
                let line = &text[line_start..line_end];
                if def_marker_content_start(line).is_some() {
                    // The next definition of the same term begins here.
                    break;
                }
                let kept: Option<&str> = if line.trim().is_empty() {
                    // A blank line stays only if indented content follows it.
                    let upcoming = next_nonblank_indent(text, line_next).unwrap_or(0);
                    if upcoming >= CONTINUATION_INDENT {
                        Some("")
                    } else {
                        None
                    }
                } else if count_indent_columns(line) >= CONTINUATION_INDENT {
                    Some(line)
                } else {
                    None
                };
                match kept {
                    Some(piece) => {
                        body_lines.push(piece);
                        scan = line_next;
                        body_end = line_next;
                    }
                    None => break,
                }
            }

            // Parse the dedented body as nested blocks in a list-item-like context.
            let raw_body = body_lines.join("\n");
            let dedented = dedent_list_item_content(&raw_body, CONTINUATION_INDENT);
            let mut def_state = ParserState::new();
            def_state.push_block(BlockContext::new_list_item(CONTINUATION_INDENT));
            let def_children = parse_blocks_internal(&dedented, depth + 1, &mut def_state)
                .map(|doc| doc.children)
                .unwrap_or_else(|e| {
                    log::warn!("Failed to parse definition description blocks: {}", e);
                    Vec::new()
                });

            let dd_start_span = input.take_from(marker_line_start);
            let dd_end_span = input.take_from(body_end);
            children.push(Node {
                kind: NodeKind::DefinitionDescription,
                span: crate::parser::shared::opt_span_range(dd_start_span, dd_end_span),
                children: def_children,
            });
            parsed_any = true;
            cursor = body_end;
        }

        // Another item joins the list only when blank lines separate it from
        // this one; otherwise stop with `cursor` just past the last definition.
        let after_blanks = skip_blank_lines(text, cursor);
        if after_blanks == cursor || !can_start_item_at(text, after_blanks) {
            break;
        }
        cursor = after_blanks;
    }

    if !parsed_any {
        return None;
    }

    let (rest, _taken) = input.take_split(cursor);
    let span = crate::parser::shared::opt_span_range(input, rest);
    let list = Node {
        kind: NodeKind::DefinitionList,
        span,
        children,
    };
    Some((rest, list))
}
#[cfg(test)]
mod tests {
    use super::parse_blocks;
    use crate::parser::ast::NodeKind;

    /// Smoke test: 250 blank-line-separated paragraphs yield exactly 250
    /// blocks, the last of which is a paragraph.
    #[test]
    fn smoke_test_block_parser_handles_large_documents() {
        let count = 250;
        let input: String = (0..count)
            .map(|i| format!("Paragraph {i}\n\n"))
            .collect();

        let doc = parse_blocks(&input).expect("parse_blocks failed");

        assert_eq!(doc.children.len(), count);
        let last_kind = doc.children.last().map(|node| &node.kind);
        assert!(matches!(last_kind, Some(NodeKind::Paragraph)));
    }
}