1pub mod shared;
7
8pub mod cm_blockquote_parser;
10pub mod cm_fenced_code_block_parser;
12pub mod cm_heading_parser;
14pub mod cm_html_blocks_parser;
16pub mod cm_indented_code_block_parser;
18pub mod cm_link_reference_parser;
20pub mod cm_list_parser;
22pub mod cm_paragraph_parser;
24pub mod cm_thematic_break_parser;
26pub mod gfm_admonitions;
28pub mod gfm_footnote_definition_parser;
30pub mod gfm_table_parser;
32pub mod marco_headerless_table_parser;
34pub mod marco_sliders_parser;
36pub mod marco_tab_blocks_parser;
38
39pub use shared::{dedent_list_item_content, to_parser_span, to_parser_span_range, GrammarSpan};
41
42use super::ast::Document;
43use crate::grammar::blocks as grammar;
44use crate::parser::ast::{Node, NodeKind};
45use nom::Input;
46
/// The kinds of container block the parser can currently be inside.
#[derive(Debug, Clone, PartialEq)]
enum BlockContextKind {
    /// A list item whose content starts at column `content_indent`.
    ListItem { content_indent: usize },
}

/// One entry of the open-container stack kept by the block parser.
#[derive(Debug, Clone)]
struct BlockContext {
    /// Which container this entry represents.
    kind: BlockContextKind,
}

impl BlockContext {
    /// Builds the context for a list item whose content begins at `content_indent`.
    pub fn new_list_item(content_indent: usize) -> Self {
        let kind = BlockContextKind::ListItem { content_indent };
        Self { kind }
    }

    /// Returns `true` when a line indented by `indent` columns still belongs to this container.
    fn can_continue_at(&self, indent: usize) -> bool {
        // Irrefutable today; adding a variant turns this into a compile error,
        // which forces the new case to be handled here.
        let BlockContextKind::ListItem { content_indent } = self.kind;
        content_indent <= indent
    }
}
83
84struct ParserState {
90 blocks: Vec<BlockContext>,
91 allow_tab_blocks: bool,
92 allow_sliders: bool,
93}
94
95impl ParserState {
96 fn new() -> Self {
97 Self {
98 blocks: Vec::new(),
99 allow_tab_blocks: true,
100 allow_sliders: true,
101 }
102 }
103
104 fn new_with_tab_blocks(allow_tab_blocks: bool) -> Self {
105 Self {
106 blocks: Vec::new(),
107 allow_tab_blocks,
108 allow_sliders: true,
109 }
110 }
111
112 fn new_with_sliders(allow_sliders: bool) -> Self {
113 Self {
114 blocks: Vec::new(),
115 allow_tab_blocks: true,
116 allow_sliders,
117 }
118 }
119
120 pub fn push_block(&mut self, context: BlockContext) {
122 self.blocks.push(context);
123 }
124
125 fn pop_block(&mut self) -> Option<BlockContext> {
127 self.blocks.pop()
128 }
129
130 fn can_continue_at(&self, indent: usize) -> bool {
132 if let Some(context) = self.blocks.last() {
133 context.can_continue_at(indent)
134 } else {
135 false
137 }
138 }
139
140 fn close_blocks_until_indent(&mut self, indent: usize) -> usize {
143 let mut closed = 0;
144
145 while let Some(context) = self.blocks.last() {
147 if context.can_continue_at(indent) {
148 break;
150 } else {
151 self.blocks.pop();
153 closed += 1;
154 }
155 }
156
157 closed
158 }
159}
160
161pub fn parse_blocks(input: &str) -> Result<Document, Box<dyn std::error::Error>> {
167 let mut state = ParserState::new();
168 parse_blocks_internal(input, 0, &mut state)
169}
170
171fn parse_blocks_internal(
173 input: &str,
174 depth: usize,
175 state: &mut ParserState,
176) -> Result<Document, Box<dyn std::error::Error>> {
177 const MAX_DEPTH: usize = 100;
179 if depth > MAX_DEPTH {
180 log::warn!("Maximum recursion depth reached in block parser");
181 return Ok(Document::new());
182 }
183
184 log::debug!(
185 "Block parser input: {} bytes at depth {}, state depth: {}",
186 input.len(),
187 depth,
188 state.blocks.len()
189 );
190
191 let mut nodes = Vec::new();
192 let mut document = Document::new(); let mut remaining = GrammarSpan::new(input);
194
195 let max_iterations = input.lines().count().saturating_mul(8).max(1_000);
199 let mut iteration_count = 0;
200 let mut last_offset = 0;
201
202 while !remaining.fragment().is_empty() {
203 iteration_count += 1;
204 if iteration_count > max_iterations {
205 log::error!(
206 "Block parser exceeded iteration limit ({}) at depth {}",
207 max_iterations,
208 depth
209 );
210 break;
211 }
212
213 let current_offset = remaining.location_offset();
215 if current_offset == last_offset && iteration_count > 1 {
216 log::error!(
217 "Block parser not making progress at offset {}, depth {}",
218 current_offset,
219 depth
220 );
221 use nom::bytes::complete::take;
223 let skip_len = remaining
224 .fragment()
225 .chars()
226 .next()
227 .map(|c| c.len_utf8())
228 .unwrap_or(1);
229 if let Ok((rest, _)) =
230 take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
231 {
232 remaining = rest;
233 last_offset = remaining.location_offset();
234 continue;
235 }
236 break;
237 }
238 last_offset = current_offset;
239
240 let first_line_end = remaining
245 .fragment()
246 .find('\n')
247 .unwrap_or(remaining.fragment().len());
248 let first_line = &remaining.fragment()[..first_line_end];
249
250 if first_line.chars().all(|c| c == ' ' || c == '\t') {
253 let peek_offset = if first_line_end < remaining.fragment().len() {
255 first_line_end + 1
256 } else {
257 first_line_end
258 };
259
260 let mut next_nonblank_indent: Option<usize> = None;
262 let rest_of_input = &remaining.fragment()[peek_offset..];
263
264 for peek_line in rest_of_input.lines() {
265 if !peek_line.trim().is_empty() {
266 let mut indent = 0;
268 for ch in peek_line.chars() {
269 if ch == ' ' {
270 indent += 1;
271 } else if ch == '\t' {
272 indent += 4 - (indent % 4); } else {
274 break;
275 }
276 }
277 next_nonblank_indent = Some(indent);
278 break;
279 }
280 }
281
282 let should_continue = if let Some(next_indent) = next_nonblank_indent {
284 state.can_continue_at(next_indent)
286 } else {
287 false
289 };
290
291 if should_continue {
292 log::debug!(
295 "Blank line: continuing context at indent {:?}",
296 next_nonblank_indent
297 );
298
299 use nom::bytes::complete::take;
300 let skip_len = if first_line_end < remaining.fragment().len() {
301 first_line_end + 1 } else {
303 first_line_end
304 };
305
306 if let Ok((new_remaining, _)) =
307 take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
308 {
309 remaining = new_remaining;
310 continue;
311 } else {
312 break;
313 }
314 } else {
315 if let Some(next_indent) = next_nonblank_indent {
318 let closed = state.close_blocks_until_indent(next_indent);
319 log::debug!(
320 "Blank line: closed {} blocks due to indent {}",
321 closed,
322 next_indent
323 );
324 } else {
325 log::debug!("Blank line: end of input, closing all blocks");
327 while state.pop_block().is_some() {}
328 }
329
330 use nom::bytes::complete::take;
332 let skip_len = if first_line_end < remaining.fragment().len() {
333 first_line_end + 1
334 } else {
335 first_line_end
336 };
337
338 if let Ok((new_remaining, _)) =
339 take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
340 {
341 remaining = new_remaining;
342 continue;
343 } else {
344 break;
345 }
346 }
347 }
348
349 if let Ok((rest, content)) = grammar::html_special_tag(remaining) {
352 nodes.push(cm_html_blocks_parser::parse_html_block(content));
353 remaining = rest;
354 continue;
355 }
356
357 if let Ok((rest, content)) = grammar::html_comment(remaining) {
359 nodes.push(cm_html_blocks_parser::parse_html_block(content));
360 remaining = rest;
361 continue;
362 }
363
364 if let Ok((rest, content)) = grammar::html_processing_instruction(remaining) {
366 nodes.push(cm_html_blocks_parser::parse_html_block(content));
367 remaining = rest;
368 continue;
369 }
370
371 if let Ok((rest, content)) = grammar::html_declaration(remaining) {
373 nodes.push(cm_html_blocks_parser::parse_html_block(content));
374 remaining = rest;
375 continue;
376 }
377
378 if let Ok((rest, content)) = grammar::html_cdata(remaining) {
380 nodes.push(cm_html_blocks_parser::parse_html_block(content));
381 remaining = rest;
382 continue;
383 }
384
385 if let Ok((rest, content)) = grammar::html_block_tag(remaining) {
387 nodes.push(cm_html_blocks_parser::parse_html_block(content));
388 remaining = rest;
389 continue;
390 }
391
392 if let Ok((rest, content)) = grammar::html_complete_tag(remaining) {
395 nodes.push(cm_html_blocks_parser::parse_html_block(content));
396 remaining = rest;
397 continue;
398 } if let Ok((rest, (level, content))) = grammar::heading(remaining) {
400 nodes.push(cm_heading_parser::parse_atx_heading(level, content));
401 remaining = rest;
402 continue;
403 }
404
405 if let Ok((rest, (language, content))) = grammar::fenced_code_block(remaining) {
407 nodes.push(cm_fenced_code_block_parser::parse_fenced_code_block(
408 language, content,
409 ));
410 remaining = rest;
411 continue;
412 }
413
414 if let Ok((rest, content)) = grammar::thematic_break(remaining) {
416 nodes.push(cm_thematic_break_parser::parse_thematic_break(content));
417 remaining = rest;
418 continue;
419 }
420
421 if let Ok((rest, content)) = grammar::blockquote(remaining) {
423 let node =
424 cm_blockquote_parser::parse_blockquote(content, depth, |cleaned, new_depth| {
425 parse_blocks_internal(cleaned, new_depth, state)
426 })?;
427
428 nodes.push(node);
429 remaining = rest;
430 continue;
431 }
432
433 if let Ok((rest, content)) = grammar::indented_code_block(remaining) {
436 nodes.push(cm_indented_code_block_parser::parse_indented_code_block(
437 content,
438 ));
439 remaining = rest;
440 continue;
441 }
442
443 if let Ok((rest, items)) = grammar::list(remaining) {
446 let node = cm_list_parser::parse_list(
447 items,
448 depth,
449 parse_blocks_internal,
450 |content_indent| {
451 let mut item_state = ParserState::new();
452 item_state.push_block(BlockContext::new_list_item(content_indent));
453 item_state
454 },
455 )?;
456
457 nodes.push(node);
458 remaining = rest;
459 continue;
460 }
461
462 if state.allow_sliders {
466 let deck_start = remaining;
467 if let Ok((rest, deck)) = grammar::marco_slide_deck(remaining) {
468 let node = marco_sliders_parser::parse_marco_slide_deck(
469 deck,
470 deck_start,
471 rest,
472 depth,
473 |slide_body, new_depth| {
474 let mut slide_state = ParserState::new_with_sliders(false);
477 parse_blocks_internal(slide_body, new_depth, &mut slide_state)
478 },
479 )?;
480
481 nodes.push(node);
482 remaining = rest;
483 continue;
484 }
485 }
486
487 let full_start = remaining;
490 if let Ok((rest, (level, content))) = grammar::setext_heading(remaining) {
491 let full_end = rest;
492 nodes.push(cm_heading_parser::parse_setext_heading(
493 level, content, full_start, full_end,
494 ));
495 remaining = rest;
496 continue;
497 }
498
499 if let Some((rest, node)) =
502 gfm_footnote_definition_parser::parse_footnote_definition(remaining)
503 {
504 nodes.push(node);
505 remaining = rest;
506 continue;
507 }
508
509 if let Ok((rest, (label, url, title))) = grammar::link_reference_definition(remaining) {
510 cm_link_reference_parser::parse_link_reference(&mut document, &label, url, title);
511 remaining = rest;
512 continue;
513 }
514
515 let headerless_table_start = remaining;
521 if let Ok((rest, table)) = grammar::headerless_table(remaining) {
522 nodes.push(marco_headerless_table_parser::parse_marco_headerless_table(
523 table,
524 headerless_table_start,
525 rest,
526 ));
527 remaining = rest;
528 continue;
529 }
530
531 let table_start = remaining;
532 if let Ok((rest, table)) = grammar::gfm_table(remaining) {
533 nodes.push(gfm_table_parser::parse_gfm_table(table, table_start, rest));
534 remaining = rest;
535 continue;
536 }
537
538 if state.allow_tab_blocks {
541 let tab_start = remaining;
542 if let Ok((rest, block)) = grammar::marco_tab_block(remaining) {
543 let node = marco_tab_blocks_parser::parse_marco_tab_block(
544 block,
545 tab_start,
546 rest,
547 depth,
548 |panel, new_depth| {
549 let mut panel_state = ParserState::new_with_tab_blocks(false);
553 parse_blocks_internal(panel, new_depth, &mut panel_state)
554 },
555 )?;
556
557 nodes.push(node);
558 remaining = rest;
559 continue;
560 }
561 }
562
563 if let Some((rest, node)) = parse_extended_definition_list(remaining, depth) {
566 nodes.push(node);
567 remaining = rest;
568 continue;
569 }
570
571 if let Ok((rest, content)) = grammar::paragraph(remaining) {
573 nodes.push(cm_paragraph_parser::parse_paragraph(content));
574 remaining = rest;
575 continue;
576 }
577
578 log::warn!(
581 "Could not parse block at offset {}, skipping character",
582 remaining.location_offset()
583 );
584 use nom::bytes::complete::take;
585 let skip_len = remaining
586 .fragment()
587 .chars()
588 .next()
589 .map(|c| c.len_utf8())
590 .unwrap_or(1);
591 if let Ok((rest, _)) =
592 take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
593 {
594 remaining = rest;
595 } else {
596 break;
597 }
598 }
599
600 log::info!("Parsed {} blocks", nodes.len());
601
602 document.children = nodes;
604 Ok(document)
605}
606
607fn parse_extended_definition_list<'a>(
628 input: GrammarSpan<'a>,
629 depth: usize,
630) -> Option<(GrammarSpan<'a>, Node)> {
631 let text = input.fragment();
633 if text.is_empty() {
634 return None;
635 }
636
637 const CONTINUATION_INDENT: usize = 2;
638
639 fn line_bounds(s: &str, start: usize) -> (usize, usize, usize) {
640 let rel_end = s[start..].find('\n').map(|i| start + i).unwrap_or(s.len());
642 let next = if rel_end < s.len() {
643 rel_end + 1
644 } else {
645 rel_end
646 };
647 (start, rel_end, next)
648 }
649
650 fn count_indent_columns(line: &str) -> usize {
651 let mut indent = 0usize;
653 for ch in line.chars() {
654 if ch == ' ' {
655 indent += 1;
656 } else if ch == '\t' {
657 indent += 4 - (indent % 4);
658 } else {
659 break;
660 }
661 }
662 indent
663 }
664
665 fn def_marker_content_start(line: &str) -> Option<usize> {
666 let bytes = line.as_bytes();
668 let mut i = 0usize;
669 for _ in 0..3 {
670 if bytes.get(i) == Some(&b' ') {
671 i += 1;
672 } else {
673 break;
674 }
675 }
676
677 if bytes.get(i) != Some(&b':') {
678 return None;
679 }
680 if bytes.get(i + 1) == Some(&b':') {
682 return None;
683 }
684
685 match bytes.get(i + 1) {
687 Some(b' ') | Some(b'\t') => {
688 Some(i + 2)
690 }
691 _ => None,
692 }
693 }
694
695 fn can_start_item_at(text: &str, start: usize) -> bool {
696 if start >= text.len() {
697 return false;
698 }
699 let (_t0s, t0e, t1s) = line_bounds(text, start);
700 let term_line = &text[start..t0e];
701 if term_line.trim().is_empty() {
702 return false;
703 }
704 if t1s >= text.len() {
705 return false;
706 }
707 let (_d0s, d0e, _d1s) = line_bounds(text, t1s);
708 let def_line = &text[t1s..d0e];
709 def_marker_content_start(def_line).is_some()
710 }
711
712 let mut children: Vec<Node> = Vec::new();
714 let mut cursor = 0usize;
715 let mut parsed_any = false;
716
717 loop {
719 if cursor >= text.len() {
720 break;
721 }
722
723 let (term_start, term_end, after_term) = line_bounds(text, cursor);
725 let term_line = &text[term_start..term_end];
726
727 if term_line.trim().is_empty() {
730 break;
731 }
732
733 if after_term >= text.len() {
735 break;
736 }
737
738 let (def_line_start, def_line_end, _after_def_line) = line_bounds(text, after_term);
739 let first_def_line = &text[def_line_start..def_line_end];
740 if def_marker_content_start(first_def_line).is_none() {
741 break;
742 }
743
744 let term_start_span = input.take_from(term_start);
746 let (term_after_span, term_taken_span) = term_start_span.take_split(term_end - term_start);
747 let term_children = match crate::parser::inlines::parse_inlines_from_span(term_taken_span) {
748 Ok(children) => children,
749 Err(e) => {
750 log::warn!("Failed to parse inline elements in definition term: {}", e);
751 vec![Node {
752 kind: NodeKind::Text(term_taken_span.fragment().to_string()),
753 span: crate::parser::shared::opt_span(term_taken_span),
754 children: Vec::new(),
755 }]
756 }
757 };
758
759 children.push(Node {
760 kind: NodeKind::DefinitionTerm,
761 span: crate::parser::shared::opt_span_range(term_start_span, term_after_span),
762 children: term_children,
763 });
764
765 cursor = after_term;
767 while cursor < text.len() {
768 let (line_start, line_end, next_line_start) = line_bounds(text, cursor);
769 let line = &text[line_start..line_end];
770
771 let content_start_in_line = match def_marker_content_start(line) {
772 Some(i) => i,
773 None => break,
774 };
775
776 let def_block_start = line_start;
778 let mut def_block_end = next_line_start;
779
780 let mut raw_lines: Vec<&str> = Vec::new();
782 raw_lines.push(&line[content_start_in_line..]);
783
784 let mut scan = next_line_start;
785 while scan < text.len() {
786 let (ls, le, ln) = line_bounds(text, scan);
787 let l = &text[ls..le];
788
789 if def_marker_content_start(l).is_some() {
791 break;
792 }
793
794 if l.trim().is_empty() {
795 let mut look = ln;
798 let mut next_indent: Option<usize> = None;
799 while look < text.len() {
800 let (_pls, ple, pln) = line_bounds(text, look);
801 let pl = &text[look..ple];
802 if !pl.trim().is_empty() {
803 next_indent = Some(count_indent_columns(pl));
804 break;
805 }
806 look = pln;
807 }
808
809 if next_indent.unwrap_or(0) >= CONTINUATION_INDENT {
810 raw_lines.push("");
811 scan = ln;
812 def_block_end = scan;
813 continue;
814 }
815
816 break;
817 }
818
819 let indent = count_indent_columns(l);
820 if indent >= CONTINUATION_INDENT {
821 raw_lines.push(l);
822 scan = ln;
823 def_block_end = scan;
824 continue;
825 }
826
827 break;
828 }
829
830 let raw_body = raw_lines.join("\n");
831 let dedented = dedent_list_item_content(&raw_body, CONTINUATION_INDENT);
832
833 let mut def_state = ParserState::new();
835 def_state.push_block(BlockContext::new_list_item(CONTINUATION_INDENT));
836 let def_children = match parse_blocks_internal(&dedented, depth + 1, &mut def_state) {
837 Ok(doc) => doc.children,
838 Err(e) => {
839 log::warn!("Failed to parse definition description blocks: {}", e);
840 Vec::new()
841 }
842 };
843
844 let dd_start_span = input.take_from(def_block_start);
845 let dd_end_span = input.take_from(def_block_end);
846 children.push(Node {
847 kind: NodeKind::DefinitionDescription,
848 span: crate::parser::shared::opt_span_range(dd_start_span, dd_end_span),
849 children: def_children,
850 });
851
852 parsed_any = true;
853 cursor = def_block_end;
854 }
855
856 let mut scan = cursor;
858 while scan < text.len() {
859 let (_ls, le, ln) = line_bounds(text, scan);
860 let l = &text[scan..le];
861 if !l.trim().is_empty() {
862 break;
863 }
864 scan = ln;
865 }
866
867 if scan != cursor && can_start_item_at(text, scan) {
868 cursor = scan;
869 continue;
870 }
871
872 break;
873 }
874
875 if !parsed_any {
876 return None;
877 }
878
879 let (rest, _taken) = input.take_split(cursor);
880 let span = crate::parser::shared::opt_span_range(input, rest);
881 Some((
882 rest,
883 Node {
884 kind: NodeKind::DefinitionList,
885 span,
886 children,
887 },
888 ))
889}
890
#[cfg(test)]
mod tests {
    use super::parse_blocks;
    use crate::parser::ast::NodeKind;

    /// Feeds 250 paragraphs separated by blank lines through the parser and checks that
    /// each one comes back as its own block.
    #[test]
    fn smoke_test_block_parser_handles_large_documents() {
        const PARAGRAPHS: usize = 250;

        let input: String = (0..PARAGRAPHS)
            .map(|i| format!("Paragraph {i}\n\n"))
            .collect();

        let doc = parse_blocks(&input).expect("parse_blocks failed");
        assert_eq!(doc.children.len(), PARAGRAPHS);

        let last = doc.children.last().unwrap();
        assert!(matches!(last.kind, NodeKind::Paragraph));
    }
}