1pub mod shared;
10
11pub mod cm_blockquote_parser;
13pub mod cm_fenced_code_block_parser;
14pub mod cm_heading_parser;
15pub mod cm_html_blocks_parser;
16pub mod cm_indented_code_block_parser;
17pub mod cm_link_reference_parser;
18pub mod cm_list_parser;
19pub mod cm_paragraph_parser;
20pub mod cm_thematic_break_parser;
21pub mod gfm_admonitions;
22pub mod gfm_footnote_definition_parser;
23pub mod gfm_table_parser;
24pub mod marco_headerless_table_parser;
25pub mod marco_sliders_parser;
26pub mod marco_tab_blocks_parser;
27
28pub use shared::{dedent_list_item_content, to_parser_span, to_parser_span_range, GrammarSpan};
30
31use super::ast::Document;
32use crate::grammar::blocks as grammar;
33use crate::parser::ast::{Node, NodeKind};
34use nom::Input;
35
/// The kinds of container block the parser can have open while it walks
/// the input.
///
/// The type is a plain value (its only payload is a `usize`), so it is
/// `Copy` and fully comparable (`Eq`) in addition to `Debug`/`Clone`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BlockContextKind {
    /// An open list item. A later line may continue the item only when its
    /// indent is at least `content_indent`.
    ListItem { content_indent: usize },
}
47
48#[derive(Debug, Clone)]
50struct BlockContext {
51 kind: BlockContextKind,
52}
53
54impl BlockContext {
55 pub fn new_list_item(content_indent: usize) -> Self {
57 Self {
58 kind: BlockContextKind::ListItem { content_indent },
59 }
60 }
61
62 fn can_continue_at(&self, indent: usize) -> bool {
64 match self.kind {
65 BlockContextKind::ListItem { content_indent } => {
66 indent >= content_indent
68 }
69 }
70 }
71}
72
73struct ParserState {
79 blocks: Vec<BlockContext>,
80 allow_tab_blocks: bool,
81 allow_sliders: bool,
82}
83
84impl ParserState {
85 fn new() -> Self {
86 Self {
87 blocks: Vec::new(),
88 allow_tab_blocks: true,
89 allow_sliders: true,
90 }
91 }
92
93 fn new_with_tab_blocks(allow_tab_blocks: bool) -> Self {
94 Self {
95 blocks: Vec::new(),
96 allow_tab_blocks,
97 allow_sliders: true,
98 }
99 }
100
101 fn new_with_sliders(allow_sliders: bool) -> Self {
102 Self {
103 blocks: Vec::new(),
104 allow_tab_blocks: true,
105 allow_sliders,
106 }
107 }
108
109 pub fn push_block(&mut self, context: BlockContext) {
111 self.blocks.push(context);
112 }
113
114 fn pop_block(&mut self) -> Option<BlockContext> {
116 self.blocks.pop()
117 }
118
119 fn can_continue_at(&self, indent: usize) -> bool {
121 if let Some(context) = self.blocks.last() {
122 context.can_continue_at(indent)
123 } else {
124 false
126 }
127 }
128
129 fn close_blocks_until_indent(&mut self, indent: usize) -> usize {
132 let mut closed = 0;
133
134 while let Some(context) = self.blocks.last() {
136 if context.can_continue_at(indent) {
137 break;
139 } else {
140 self.blocks.pop();
142 closed += 1;
143 }
144 }
145
146 closed
147 }
148}
149
150pub fn parse_blocks(input: &str) -> Result<Document, Box<dyn std::error::Error>> {
156 let mut state = ParserState::new();
157 parse_blocks_internal(input, 0, &mut state)
158}
159
160fn parse_blocks_internal(
162 input: &str,
163 depth: usize,
164 state: &mut ParserState,
165) -> Result<Document, Box<dyn std::error::Error>> {
166 const MAX_DEPTH: usize = 100;
168 if depth > MAX_DEPTH {
169 log::warn!("Maximum recursion depth reached in block parser");
170 return Ok(Document::new());
171 }
172
173 log::debug!(
174 "Block parser input: {} bytes at depth {}, state depth: {}",
175 input.len(),
176 depth,
177 state.blocks.len()
178 );
179
180 let mut nodes = Vec::new();
181 let mut document = Document::new(); let mut remaining = GrammarSpan::new(input);
183
184 let max_iterations = input.lines().count().saturating_mul(8).max(1_000);
188 let mut iteration_count = 0;
189 let mut last_offset = 0;
190
191 while !remaining.fragment().is_empty() {
192 iteration_count += 1;
193 if iteration_count > max_iterations {
194 log::error!(
195 "Block parser exceeded iteration limit ({}) at depth {}",
196 max_iterations,
197 depth
198 );
199 break;
200 }
201
202 let current_offset = remaining.location_offset();
204 if current_offset == last_offset && iteration_count > 1 {
205 log::error!(
206 "Block parser not making progress at offset {}, depth {}",
207 current_offset,
208 depth
209 );
210 use nom::bytes::complete::take;
212 let skip_len = remaining
213 .fragment()
214 .chars()
215 .next()
216 .map(|c| c.len_utf8())
217 .unwrap_or(1);
218 if let Ok((rest, _)) =
219 take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
220 {
221 remaining = rest;
222 last_offset = remaining.location_offset();
223 continue;
224 }
225 break;
226 }
227 last_offset = current_offset;
228
229 let first_line_end = remaining
234 .fragment()
235 .find('\n')
236 .unwrap_or(remaining.fragment().len());
237 let first_line = &remaining.fragment()[..first_line_end];
238
239 if first_line.chars().all(|c| c == ' ' || c == '\t') {
242 let peek_offset = if first_line_end < remaining.fragment().len() {
244 first_line_end + 1
245 } else {
246 first_line_end
247 };
248
249 let mut next_nonblank_indent: Option<usize> = None;
251 let rest_of_input = &remaining.fragment()[peek_offset..];
252
253 for peek_line in rest_of_input.lines() {
254 if !peek_line.trim().is_empty() {
255 let mut indent = 0;
257 for ch in peek_line.chars() {
258 if ch == ' ' {
259 indent += 1;
260 } else if ch == '\t' {
261 indent += 4 - (indent % 4); } else {
263 break;
264 }
265 }
266 next_nonblank_indent = Some(indent);
267 break;
268 }
269 }
270
271 let should_continue = if let Some(next_indent) = next_nonblank_indent {
273 state.can_continue_at(next_indent)
275 } else {
276 false
278 };
279
280 if should_continue {
281 log::debug!(
284 "Blank line: continuing context at indent {:?}",
285 next_nonblank_indent
286 );
287
288 use nom::bytes::complete::take;
289 let skip_len = if first_line_end < remaining.fragment().len() {
290 first_line_end + 1 } else {
292 first_line_end
293 };
294
295 if let Ok((new_remaining, _)) =
296 take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
297 {
298 remaining = new_remaining;
299 continue;
300 } else {
301 break;
302 }
303 } else {
304 if let Some(next_indent) = next_nonblank_indent {
307 let closed = state.close_blocks_until_indent(next_indent);
308 log::debug!(
309 "Blank line: closed {} blocks due to indent {}",
310 closed,
311 next_indent
312 );
313 } else {
314 log::debug!("Blank line: end of input, closing all blocks");
316 while state.pop_block().is_some() {}
317 }
318
319 use nom::bytes::complete::take;
321 let skip_len = if first_line_end < remaining.fragment().len() {
322 first_line_end + 1
323 } else {
324 first_line_end
325 };
326
327 if let Ok((new_remaining, _)) =
328 take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
329 {
330 remaining = new_remaining;
331 continue;
332 } else {
333 break;
334 }
335 }
336 }
337
338 if let Ok((rest, content)) = grammar::html_special_tag(remaining) {
341 nodes.push(cm_html_blocks_parser::parse_html_block(content));
342 remaining = rest;
343 continue;
344 }
345
346 if let Ok((rest, content)) = grammar::html_comment(remaining) {
348 nodes.push(cm_html_blocks_parser::parse_html_block(content));
349 remaining = rest;
350 continue;
351 }
352
353 if let Ok((rest, content)) = grammar::html_processing_instruction(remaining) {
355 nodes.push(cm_html_blocks_parser::parse_html_block(content));
356 remaining = rest;
357 continue;
358 }
359
360 if let Ok((rest, content)) = grammar::html_declaration(remaining) {
362 nodes.push(cm_html_blocks_parser::parse_html_block(content));
363 remaining = rest;
364 continue;
365 }
366
367 if let Ok((rest, content)) = grammar::html_cdata(remaining) {
369 nodes.push(cm_html_blocks_parser::parse_html_block(content));
370 remaining = rest;
371 continue;
372 }
373
374 if let Ok((rest, content)) = grammar::html_block_tag(remaining) {
376 nodes.push(cm_html_blocks_parser::parse_html_block(content));
377 remaining = rest;
378 continue;
379 }
380
381 if let Ok((rest, content)) = grammar::html_complete_tag(remaining) {
384 nodes.push(cm_html_blocks_parser::parse_html_block(content));
385 remaining = rest;
386 continue;
387 } if let Ok((rest, (level, content))) = grammar::heading(remaining) {
389 nodes.push(cm_heading_parser::parse_atx_heading(level, content));
390 remaining = rest;
391 continue;
392 }
393
394 if let Ok((rest, (language, content))) = grammar::fenced_code_block(remaining) {
396 nodes.push(cm_fenced_code_block_parser::parse_fenced_code_block(
397 language, content,
398 ));
399 remaining = rest;
400 continue;
401 }
402
403 if let Ok((rest, content)) = grammar::thematic_break(remaining) {
405 nodes.push(cm_thematic_break_parser::parse_thematic_break(content));
406 remaining = rest;
407 continue;
408 }
409
410 if let Ok((rest, content)) = grammar::blockquote(remaining) {
412 let node =
413 cm_blockquote_parser::parse_blockquote(content, depth, |cleaned, new_depth| {
414 parse_blocks_internal(cleaned, new_depth, state)
415 })?;
416
417 nodes.push(node);
418 remaining = rest;
419 continue;
420 }
421
422 if let Ok((rest, content)) = grammar::indented_code_block(remaining) {
425 nodes.push(cm_indented_code_block_parser::parse_indented_code_block(
426 content,
427 ));
428 remaining = rest;
429 continue;
430 }
431
432 if let Ok((rest, items)) = grammar::list(remaining) {
435 let node = cm_list_parser::parse_list(
436 items,
437 depth,
438 parse_blocks_internal,
439 |content_indent| {
440 let mut item_state = ParserState::new();
441 item_state.push_block(BlockContext::new_list_item(content_indent));
442 item_state
443 },
444 )?;
445
446 nodes.push(node);
447 remaining = rest;
448 continue;
449 }
450
451 if state.allow_sliders {
455 let deck_start = remaining;
456 if let Ok((rest, deck)) = grammar::marco_slide_deck(remaining) {
457 let node = marco_sliders_parser::parse_marco_slide_deck(
458 deck,
459 deck_start,
460 rest,
461 depth,
462 |slide_body, new_depth| {
463 let mut slide_state = ParserState::new_with_sliders(false);
466 parse_blocks_internal(slide_body, new_depth, &mut slide_state)
467 },
468 )?;
469
470 nodes.push(node);
471 remaining = rest;
472 continue;
473 }
474 }
475
476 let full_start = remaining;
479 if let Ok((rest, (level, content))) = grammar::setext_heading(remaining) {
480 let full_end = rest;
481 nodes.push(cm_heading_parser::parse_setext_heading(
482 level, content, full_start, full_end,
483 ));
484 remaining = rest;
485 continue;
486 }
487
488 if let Some((rest, node)) =
491 gfm_footnote_definition_parser::parse_footnote_definition(remaining)
492 {
493 nodes.push(node);
494 remaining = rest;
495 continue;
496 }
497
498 if let Ok((rest, (label, url, title))) = grammar::link_reference_definition(remaining) {
499 cm_link_reference_parser::parse_link_reference(&mut document, &label, url, title);
500 remaining = rest;
501 continue;
502 }
503
504 let headerless_table_start = remaining;
510 if let Ok((rest, table)) = grammar::marco_headerless_table(remaining) {
511 nodes.push(marco_headerless_table_parser::parse_marco_headerless_table(
512 table,
513 headerless_table_start,
514 rest,
515 ));
516 remaining = rest;
517 continue;
518 }
519
520 let table_start = remaining;
521 if let Ok((rest, table)) = grammar::gfm_table(remaining) {
522 nodes.push(gfm_table_parser::parse_gfm_table(table, table_start, rest));
523 remaining = rest;
524 continue;
525 }
526
527 if state.allow_tab_blocks {
530 let tab_start = remaining;
531 if let Ok((rest, block)) = grammar::marco_tab_block(remaining) {
532 let node = marco_tab_blocks_parser::parse_marco_tab_block(
533 block,
534 tab_start,
535 rest,
536 depth,
537 |panel, new_depth| {
538 let mut panel_state = ParserState::new_with_tab_blocks(false);
542 parse_blocks_internal(panel, new_depth, &mut panel_state)
543 },
544 )?;
545
546 nodes.push(node);
547 remaining = rest;
548 continue;
549 }
550 }
551
552 if let Some((rest, node)) = parse_extended_definition_list(remaining, depth) {
555 nodes.push(node);
556 remaining = rest;
557 continue;
558 }
559
560 if let Ok((rest, content)) = grammar::paragraph(remaining) {
562 nodes.push(cm_paragraph_parser::parse_paragraph(content));
563 remaining = rest;
564 continue;
565 }
566
567 log::warn!(
570 "Could not parse block at offset {}, skipping character",
571 remaining.location_offset()
572 );
573 use nom::bytes::complete::take;
574 let skip_len = remaining
575 .fragment()
576 .chars()
577 .next()
578 .map(|c| c.len_utf8())
579 .unwrap_or(1);
580 if let Ok((rest, _)) =
581 take::<_, _, nom::error::Error<GrammarSpan>>(skip_len as u32)(remaining)
582 {
583 remaining = rest;
584 } else {
585 break;
586 }
587 }
588
589 log::info!("Parsed {} blocks", nodes.len());
590
591 document.children = nodes;
593 Ok(document)
594}
595
596fn parse_extended_definition_list<'a>(
617 input: GrammarSpan<'a>,
618 depth: usize,
619) -> Option<(GrammarSpan<'a>, Node)> {
620 let text = input.fragment();
622 if text.is_empty() {
623 return None;
624 }
625
626 const CONTINUATION_INDENT: usize = 2;
627
628 fn line_bounds(s: &str, start: usize) -> (usize, usize, usize) {
629 let rel_end = s[start..].find('\n').map(|i| start + i).unwrap_or(s.len());
631 let next = if rel_end < s.len() {
632 rel_end + 1
633 } else {
634 rel_end
635 };
636 (start, rel_end, next)
637 }
638
639 fn count_indent_columns(line: &str) -> usize {
640 let mut indent = 0usize;
642 for ch in line.chars() {
643 if ch == ' ' {
644 indent += 1;
645 } else if ch == '\t' {
646 indent += 4 - (indent % 4);
647 } else {
648 break;
649 }
650 }
651 indent
652 }
653
654 fn def_marker_content_start(line: &str) -> Option<usize> {
655 let bytes = line.as_bytes();
657 let mut i = 0usize;
658 for _ in 0..3 {
659 if bytes.get(i) == Some(&b' ') {
660 i += 1;
661 } else {
662 break;
663 }
664 }
665
666 if bytes.get(i) != Some(&b':') {
667 return None;
668 }
669 if bytes.get(i + 1) == Some(&b':') {
671 return None;
672 }
673
674 match bytes.get(i + 1) {
676 Some(b' ') | Some(b'\t') => {
677 Some(i + 2)
679 }
680 _ => None,
681 }
682 }
683
684 fn can_start_item_at(text: &str, start: usize) -> bool {
685 if start >= text.len() {
686 return false;
687 }
688 let (_t0s, t0e, t1s) = line_bounds(text, start);
689 let term_line = &text[start..t0e];
690 if term_line.trim().is_empty() {
691 return false;
692 }
693 if t1s >= text.len() {
694 return false;
695 }
696 let (_d0s, d0e, _d1s) = line_bounds(text, t1s);
697 let def_line = &text[t1s..d0e];
698 def_marker_content_start(def_line).is_some()
699 }
700
701 let mut children: Vec<Node> = Vec::new();
703 let mut cursor = 0usize;
704 let mut parsed_any = false;
705
706 loop {
708 if cursor >= text.len() {
709 break;
710 }
711
712 let (term_start, term_end, after_term) = line_bounds(text, cursor);
714 let term_line = &text[term_start..term_end];
715
716 if term_line.trim().is_empty() {
719 break;
720 }
721
722 if after_term >= text.len() {
724 break;
725 }
726
727 let (def_line_start, def_line_end, _after_def_line) = line_bounds(text, after_term);
728 let first_def_line = &text[def_line_start..def_line_end];
729 if def_marker_content_start(first_def_line).is_none() {
730 break;
731 }
732
733 let term_start_span = input.take_from(term_start);
735 let (term_after_span, term_taken_span) = term_start_span.take_split(term_end - term_start);
736 let term_children = match crate::parser::inlines::parse_inlines_from_span(term_taken_span) {
737 Ok(children) => children,
738 Err(e) => {
739 log::warn!("Failed to parse inline elements in definition term: {}", e);
740 vec![Node {
741 kind: NodeKind::Text(term_taken_span.fragment().to_string()),
742 span: Some(crate::parser::shared::to_parser_span(term_taken_span)),
743 children: Vec::new(),
744 }]
745 }
746 };
747
748 children.push(Node {
749 kind: NodeKind::DefinitionTerm,
750 span: Some(crate::parser::shared::to_parser_span_range(
751 term_start_span,
752 term_after_span,
753 )),
754 children: term_children,
755 });
756
757 cursor = after_term;
759 while cursor < text.len() {
760 let (line_start, line_end, next_line_start) = line_bounds(text, cursor);
761 let line = &text[line_start..line_end];
762
763 let content_start_in_line = match def_marker_content_start(line) {
764 Some(i) => i,
765 None => break,
766 };
767
768 let def_block_start = line_start;
770 let mut def_block_end = next_line_start;
771
772 let mut raw_lines: Vec<&str> = Vec::new();
774 raw_lines.push(&line[content_start_in_line..]);
775
776 let mut scan = next_line_start;
777 while scan < text.len() {
778 let (ls, le, ln) = line_bounds(text, scan);
779 let l = &text[ls..le];
780
781 if def_marker_content_start(l).is_some() {
783 break;
784 }
785
786 if l.trim().is_empty() {
787 let mut look = ln;
790 let mut next_indent: Option<usize> = None;
791 while look < text.len() {
792 let (_pls, ple, pln) = line_bounds(text, look);
793 let pl = &text[look..ple];
794 if !pl.trim().is_empty() {
795 next_indent = Some(count_indent_columns(pl));
796 break;
797 }
798 look = pln;
799 }
800
801 if next_indent.unwrap_or(0) >= CONTINUATION_INDENT {
802 raw_lines.push("");
803 scan = ln;
804 def_block_end = scan;
805 continue;
806 }
807
808 break;
809 }
810
811 let indent = count_indent_columns(l);
812 if indent >= CONTINUATION_INDENT {
813 raw_lines.push(l);
814 scan = ln;
815 def_block_end = scan;
816 continue;
817 }
818
819 break;
820 }
821
822 let raw_body = raw_lines.join("\n");
823 let dedented = dedent_list_item_content(&raw_body, CONTINUATION_INDENT);
824
825 let mut def_state = ParserState::new();
827 def_state.push_block(BlockContext::new_list_item(CONTINUATION_INDENT));
828 let def_children = match parse_blocks_internal(&dedented, depth + 1, &mut def_state) {
829 Ok(doc) => doc.children,
830 Err(e) => {
831 log::warn!("Failed to parse definition description blocks: {}", e);
832 Vec::new()
833 }
834 };
835
836 let dd_start_span = input.take_from(def_block_start);
837 let dd_end_span = input.take_from(def_block_end);
838 children.push(Node {
839 kind: NodeKind::DefinitionDescription,
840 span: Some(crate::parser::shared::to_parser_span_range(
841 dd_start_span,
842 dd_end_span,
843 )),
844 children: def_children,
845 });
846
847 parsed_any = true;
848 cursor = def_block_end;
849 }
850
851 let mut scan = cursor;
853 while scan < text.len() {
854 let (_ls, le, ln) = line_bounds(text, scan);
855 let l = &text[scan..le];
856 if !l.trim().is_empty() {
857 break;
858 }
859 scan = ln;
860 }
861
862 if scan != cursor && can_start_item_at(text, scan) {
863 cursor = scan;
864 continue;
865 }
866
867 break;
868 }
869
870 if !parsed_any {
871 return None;
872 }
873
874 let (rest, _taken) = input.take_split(cursor);
875 let span = crate::parser::shared::to_parser_span_range(input, rest);
876 Some((
877 rest,
878 Node {
879 kind: NodeKind::DefinitionList,
880 span: Some(span),
881 children,
882 },
883 ))
884}
885
#[cfg(test)]
mod tests {
    use super::parse_blocks;
    use crate::parser::ast::NodeKind;

    /// Many blank-line-separated paragraphs must each yield exactly one
    /// paragraph node.
    #[test]
    fn smoke_test_block_parser_handles_large_documents() {
        const PARAGRAPHS: usize = 250;

        let input: String = (0..PARAGRAPHS)
            .map(|i| format!("Paragraph {i}\n\n"))
            .collect();

        let doc = parse_blocks(&input).expect("parse_blocks failed");
        assert_eq!(doc.children.len(), PARAGRAPHS);

        let last = doc.children.last().unwrap();
        assert!(matches!(last.kind, NodeKind::Paragraph));
    }
}