use crate::{Diagnostic, DiagnosticSeverity, Error, HeadingBlock, Result, TocEntry};
use base64::{Engine, engine::general_purpose::STANDARD as B64};
use sha2::{Digest, Sha256};
use std::collections::VecDeque;
use tree_sitter::{Node, Parser, TreeCursor};

/// Window size, in lines, used to segment documents that contain no headings.
const FALLBACK_WINDOW_LINES: usize = 200;

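/// Parses Markdown documents into heading-scoped blocks, a table of contents,
/// and diagnostics, using the tree-sitter Markdown grammar.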
pub struct MarkdownParser {
    parser: Parser,
}

impl MarkdownParser {
    pub fn new() -> Result<Self> {
        let mut parser = Parser::new();
        parser
            .set_language(&tree_sitter_md::LANGUAGE.into())
            .map_err(|e| Error::Parse(format!("Failed to set language: {e}")))?;

        Ok(Self { parser })
    }

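    /// Parses `text` into heading blocks, a table of contents, diagnostics,
    /// and the total line count. Documents without headings fall back to a
    /// single "Document" block, or to fixed-size windows of
    /// `FALLBACK_WINDOW_LINES` lines for larger documents.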
    pub fn parse(&mut self, text: &str) -> Result<ParseResult> {
        let tree = self
            .parser
            .parse(text, None)
            .ok_or_else(|| Error::Parse("Failed to parse markdown".into()))?;

        let root = tree.root_node();
        let mut diagnostics = Vec::new();
        let mut heading_blocks = Vec::new();
        let mut toc = Vec::new();

        if root.has_error() {
            diagnostics.push(Diagnostic {
                severity: DiagnosticSeverity::Warn,
                message: "Parse tree contains errors, using fallback parsing".into(),
                line: None,
            });
        }

        let mut cursor = root.walk();
        Self::extract_headings(&mut cursor, text, &mut heading_blocks, &mut toc);

        if heading_blocks.is_empty() {
            diagnostics.push(Diagnostic {
                severity: DiagnosticSeverity::Warn,
                message: "No headings found in document".into(),
                line: Some(1),
            });

            let total_lines = text.lines().count();
            if total_lines <= FALLBACK_WINDOW_LINES {
                heading_blocks.push(HeadingBlock {
                    path: vec!["Document".into()],
                    content: text.to_string(),
                    start_line: 1,
                    end_line: total_lines,
                });
            } else {
                let mut start = 1usize;
                let mut current = String::new();
                let mut count = 0usize;
                for line in text.lines() {
                    if count > 0 {
                        current.push('\n');
                    }
                    current.push_str(line);
                    count += 1;
                    if count == FALLBACK_WINDOW_LINES {
                        let end_line = start + count - 1;
                        heading_blocks.push(HeadingBlock {
                            path: vec!["Document".into()],
                            content: std::mem::take(&mut current),
                            start_line: start,
                            end_line,
                        });
                        start = end_line + 1;
                        count = 0;
                    }
                }
                if !current.is_empty() {
                    let end_line = start + count - 1;
                    heading_blocks.push(HeadingBlock {
                        path: vec!["Document".into()],
                        content: current,
                        start_line: start,
                        end_line,
                    });
                }
            }
        }

        let line_count = text.lines().count();

        Ok(ParseResult {
            heading_blocks,
            toc,
            diagnostics,
            line_count,
        })
    }

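    /// Collects every ATX heading from the parse tree and emits one block per
    /// heading, tracking the heading hierarchy to build nested TOC entries.
    /// Top-level "404" placeholder headings are skipped.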
    fn extract_headings(
        cursor: &mut TreeCursor,
        text: &str,
        blocks: &mut Vec<HeadingBlock>,
        toc: &mut Vec<TocEntry>,
    ) {
        #[derive(Debug)]
        struct HeadingInfo {
            level: usize,
            text: String,
            byte_start: usize,
            line_start: usize,
        }

        let mut headings = Vec::new();

        Self::walk_tree(cursor, text, |node| {
            if node.kind() == "atx_heading" {
                let level = Self::get_heading_level(node, text);
                let heading_text = Self::get_heading_text(node, text);
                let line_start = node.start_position().row;

                headings.push(HeadingInfo {
                    level,
                    text: heading_text,
                    byte_start: node.byte_range().start,
                    line_start,
                });
            }
        });

        if headings.is_empty() {
            return;
        }

        headings.sort_by_key(|h| h.byte_start);

        let mut current_path = Vec::new();
        let mut stack: VecDeque<usize> = VecDeque::new();
        let mut baseline_level: Option<usize> = None;

        for i in 0..headings.len() {
            let heading = &headings[i];

            let trimmed = heading.text.trim();
            if heading.level == 1 && trimmed.starts_with("404") {
                current_path.clear();
                stack.clear();
                continue;
            }

            if baseline_level.is_none_or(|level| heading.level < level) {
                baseline_level = Some(heading.level);
            }
            let baseline = baseline_level.unwrap_or(1);
            let effective_level = heading
                .level
                .saturating_sub(baseline.saturating_sub(1))
                .max(1);

            while stack.len() >= effective_level {
                stack.pop_back();
                current_path.pop();
            }
            current_path.push(heading.text.clone());
            stack.push_back(effective_level);

            let content_start = heading.byte_start;
            let content_end = if i + 1 < headings.len() {
                headings[i + 1].byte_start
            } else {
                text.len()
            };

            let content = &text[content_start..content_end];

            let start_line = heading.line_start + 1;
            let end_line = if i + 1 < headings.len() {
                headings[i + 1].line_start
            } else {
                text.lines().count()
            };

            blocks.push(HeadingBlock {
                path: current_path.clone(),
                content: content.to_string(),
                start_line,
                end_line,
            });

            let anchor = Some(Self::compute_anchor(&current_path, &heading.text, content));

            let entry = TocEntry {
                heading_path: current_path.clone(),
                lines: if end_line > start_line {
                    format!("{start_line}-{end_line}")
                } else {
                    format!("{start_line}")
                },
                anchor,
                children: Vec::new(),
            };

            Self::add_to_toc(toc, entry, stack.len());
        }
    }

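    /// Derives a stable anchor from the normalized heading text alone, so the
    /// anchor is unchanged when a section moves or its body is edited.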
    fn compute_anchor(_path: &[String], heading_text: &str, _content: &str) -> String {
        let mut hasher = Sha256::new();
        hasher.update(heading_text.trim().to_lowercase().as_bytes());
        let digest = hasher.finalize();
        let full = B64.encode(digest);
        full[..22.min(full.len())].to_string()
    }

    fn walk_tree<F>(cursor: &mut TreeCursor, _text: &str, mut callback: F)
    where
        F: FnMut(Node),
    {
        loop {
            let node = cursor.node();
            callback(node);

            if cursor.goto_first_child() {
                continue;
            }

            if cursor.goto_next_sibling() {
                continue;
            }

            loop {
                if !cursor.goto_parent() {
                    return;
                }
                if cursor.goto_next_sibling() {
                    break;
                }
            }
        }
    }

    fn get_heading_level(node: Node, _text: &str) -> usize {
        for child in node.children(&mut node.walk()) {
            if child.kind() == "atx_h1_marker" {
                return 1;
            } else if child.kind() == "atx_h2_marker" {
                return 2;
            } else if child.kind() == "atx_h3_marker" {
                return 3;
            } else if child.kind() == "atx_h4_marker" {
                return 4;
            } else if child.kind() == "atx_h5_marker" {
                return 5;
            } else if child.kind() == "atx_h6_marker" {
                return 6;
            }
        }
        1
    }

    fn get_heading_text(node: Node, text: &str) -> String {
        for child in node.children(&mut node.walk()) {
            if child.kind().contains("heading") && child.kind().contains("content") {
                return text[child.byte_range()].trim().to_string();
            }
        }

        let full_text = &text[node.byte_range()];
        full_text.trim_start_matches('#').trim().to_string()
    }

    fn add_to_toc(toc: &mut Vec<TocEntry>, entry: TocEntry, depth: usize) {
        if depth == 1 {
            toc.push(entry);
        } else if let Some(parent) = toc.last_mut() {
            Self::add_to_toc_recursive(&mut parent.children, entry, depth - 1);
        }
    }

    fn add_to_toc_recursive(toc: &mut Vec<TocEntry>, entry: TocEntry, depth: usize) {
        if depth == 1 {
            toc.push(entry);
        } else if let Some(parent) = toc.last_mut() {
            Self::add_to_toc_recursive(&mut parent.children, entry, depth - 1);
        }
    }
}

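/// Result of parsing a Markdown document.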
#[derive(Clone)]
pub struct ParseResult {
    /// Content blocks, one per heading (or per fallback window).
    pub heading_blocks: Vec<HeadingBlock>,

    /// Nested table of contents built from the heading hierarchy.
    pub toc: Vec<TocEntry>,

    /// Diagnostics emitted while parsing.
    pub diagnostics: Vec<Diagnostic>,

    /// Total number of lines in the parsed document.
    pub line_count: usize,
}

#[cfg(test)]
#[allow(
    clippy::unwrap_used,
    clippy::unnecessary_wraps,
    clippy::format_push_string,
    clippy::disallowed_macros
)]
mod tests {
    use super::*;
    use proptest::prelude::*;

    fn create_test_parser() -> MarkdownParser {
        MarkdownParser::new().expect("Failed to create parser")
    }

    #[test]
    fn test_anchor_stability_when_section_moves() {
        let mut parser = create_test_parser();

        let doc_v1 = "# Intro\n\nPrelude.\n\n## Section A\n\nAlpha content line 1.\nAlpha content line 2.\n\n## Section B\n\nBeta content.\n";

        let result_v1 = parser.parse(doc_v1).expect("parse v1");
        #[allow(clippy::items_after_statements)]
        fn find<'a>(entries: &'a [TocEntry], name: &str) -> Option<&'a TocEntry> {
            for e in entries {
                if e.heading_path.last().is_some_and(|h| h == name) {
                    return Some(e);
                }
                if let Some(found) = find(&e.children, name) {
                    return Some(found);
                }
            }
            None
        }
        let a_v1 = find(&result_v1.toc, "Section A").expect("section A in v1");
        let anchor_v1 = a_v1.anchor.clone().expect("anchor v1");
        let lines_v1 = a_v1.lines.clone();

        let doc_v2 = "# Intro\n\nPrelude.\n\n## Section B\n\nBeta content.\n\n## Section A\n\nAlpha content line 1.\nAlpha content line 2.\n";
        let result_v2 = parser.parse(doc_v2).expect("parse v2");
        let a_v2 = find(&result_v2.toc, "Section A").expect("section A in v2");
        let anchor_v2 = a_v2.anchor.clone().expect("anchor v2");
        let lines_v2 = a_v2.lines.clone();

        assert_eq!(anchor_v1, anchor_v2, "anchor stable across moves");
        assert_ne!(lines_v1, lines_v2, "lines should reflect new position");
    }

    #[test]
    fn test_skips_placeholder_404_headings() -> Result<()> {
        let mut parser = create_test_parser();

        let doc = r"# 404

Check the URL.

## Actual Section

Real content lives here.

### Nested Detail

Additional context.

## Follow Up

More guidance.
";

        let result = parser.parse(doc)?;

        assert_eq!(
            result.toc.len(),
            2,
            "top-level entries should ignore 404 headings"
        );
        assert!(
            result.toc.iter().all(|entry| entry
                .heading_path
                .iter()
                .all(|component| !component.starts_with("404"))),
            "toc should not contain placeholder 404 entries"
        );
        assert_eq!(
            result.heading_blocks.len(),
            3,
            "children under 404 should remain accessible"
        );
        assert_eq!(result.heading_blocks[0].path[0], "Actual Section");

        Ok(())
    }

    fn simple_markdown() -> &'static str {
        r"# Main Heading

This is some content under the main heading.

## Sub Heading

More content here.

### Deep Heading

Even deeper content.

## Another Sub

Final content.
"
    }

    fn complex_markdown() -> &'static str {
        r#"# Getting Started

Welcome to our documentation!

## Installation

Run the following command:

```bash
npm install
```

### Requirements

- Node.js 16+
- npm 7+

## Usage

Here's how to use it:

1. First step
2. Second step

### Advanced Usage

For advanced users:

#### Configuration

Edit the config file:

```json
{
  "key": "value"
}
```

## Troubleshooting

Common issues:

- Issue 1
- Issue 2
"#
    }

    fn malformed_markdown() -> &'static str {
        r"# Broken Heading
## Missing content

### Unmatched brackets ][

Content with `unclosed code

> Broken quote
>> Nested broken quote

* List item
  * Nested without proper spacing
* Another item

```
Unclosed code block
"
    }

    #[test]
    fn test_parser_creation() {
        let result = MarkdownParser::new();

        assert!(result.is_ok());
    }

    #[test]
    fn test_parse_simple_markdown() -> Result<()> {
        let mut parser = create_test_parser();
        let markdown = simple_markdown();

        let result = parser.parse(markdown)?;

        assert!(!result.heading_blocks.is_empty());
        assert!(!result.toc.is_empty());
        assert_eq!(result.line_count, markdown.lines().count());

        let main_heading = result
            .heading_blocks
            .iter()
            .find(|block| block.path.contains(&"Main Heading".to_string()));
        assert!(main_heading.is_some());

        let sub_heading = result
            .heading_blocks
            .iter()
            .find(|block| block.path.contains(&"Sub Heading".to_string()));
        assert!(sub_heading.is_some());

        Ok(())
    }

    #[test]
    fn test_parse_complex_markdown_structure() -> Result<()> {
        let mut parser = create_test_parser();
        let markdown = complex_markdown();

        let result = parser.parse(markdown)?;

        assert!(result.heading_blocks.len() >= 5);

        let headings: Vec<_> = result
            .heading_blocks
            .iter()
            .flat_map(|block| &block.path)
            .collect();

        assert!(headings.iter().any(|h| h.contains("Getting Started")));
        assert!(headings.iter().any(|h| h.contains("Installation")));
        assert!(headings.iter().any(|h| h.contains("Requirements")));
        assert!(headings.iter().any(|h| h.contains("Configuration")));

        assert!(!result.toc.is_empty());
        let top_level = &result.toc[0];
        assert!(
            top_level
                .heading_path
                .contains(&"Getting Started".to_string())
        );

        Ok(())
    }

    #[test]
    fn test_parse_malformed_markdown() -> Result<()> {
        let mut parser = create_test_parser();
        let markdown = malformed_markdown();

        let result = parser.parse(markdown)?;

        assert!(!result.heading_blocks.is_empty());

        Ok(())
    }

    #[test]
    fn test_parse_empty_document() -> Result<()> {
        let mut parser = create_test_parser();
        let empty = "";

        let result = parser.parse(empty)?;

        assert_eq!(result.line_count, 0);
        assert!(result.heading_blocks.len() <= 1);
        assert!(
            result
                .diagnostics
                .iter()
                .any(|d| d.message.contains("No headings found")
                    || d.severity == DiagnosticSeverity::Warn)
        );

        Ok(())
    }

    #[test]
    fn test_parse_document_without_headings() -> Result<()> {
        let mut parser = create_test_parser();
        let no_headings = r"This is just plain text.

With multiple paragraphs.

And some more content.

But no headings at all.
";

        let result = parser.parse(no_headings)?;

        assert_eq!(result.heading_blocks.len(), 1);
        let block = &result.heading_blocks[0];
        assert_eq!(block.path, vec!["Document".to_string()]);
        assert_eq!(block.content.trim(), no_headings.trim());

        assert!(
            result
                .diagnostics
                .iter()
                .any(|d| d.message.contains("No headings found"))
        );

        Ok(())
    }

    #[test]
    fn test_windowed_segmentation_for_large_unstructured() -> Result<()> {
        let mut parser = create_test_parser();
        let total = FALLBACK_WINDOW_LINES * 2 + 25;
        let doc = (1..=total)
            .map(|i| format!("line {i}"))
            .collect::<Vec<_>>()
            .join("\n");

        let result = parser.parse(&doc)?;

        assert_eq!(result.heading_blocks.len(), 3);
        for b in &result.heading_blocks {
            assert_eq!(b.path, vec!["Document".to_string()]);
            assert!(b.start_line >= 1);
            assert!(b.end_line <= total);
        }
        assert_eq!(result.heading_blocks.last().unwrap().end_line, total);

        Ok(())
    }

    #[test]
    fn test_heading_level_detection() -> Result<()> {
        let mut parser = create_test_parser();
        let multilevel = r"# Level 1

## Level 2

### Level 3

#### Level 4

##### Level 5

###### Level 6
";

        let result = parser.parse(multilevel)?;

        assert!(result.heading_blocks.len() >= 6);

        let paths: Vec<_> = result
            .heading_blocks
            .iter()
            .map(|block| block.path.len())
            .collect();

        assert!(paths.contains(&1));
        assert!(paths.contains(&2));
        assert!(paths.iter().any(|&len| len >= 3));

        Ok(())
    }

    #[test]
    fn test_heading_text_extraction() -> Result<()> {
        let mut parser = create_test_parser();
        let formatted_headings = r"# **Bold Heading**

## _Italic Heading_

### `Code in Heading`

#### Heading with [Link](http://example.com)

##### Heading with **bold** and _italic_
";

        let result = parser.parse(formatted_headings)?;

        let heading_texts: Vec<_> = result
            .heading_blocks
            .iter()
            .flat_map(|block| &block.path)
            .collect();

        assert!(heading_texts.iter().any(|h| h.contains("Bold Heading")));
        assert!(heading_texts.iter().any(|h| h.contains("Italic Heading")));
        assert!(heading_texts.iter().any(|h| h.contains("Code in Heading")));

        Ok(())
    }

    #[test]
    fn test_content_extraction() -> Result<()> {
        let mut parser = create_test_parser();
        let content_markdown = r"# Section A

This is content for section A.
It spans multiple lines.

## Subsection A1

More specific content here.

# Section B

Different content for section B.
";

        let result = parser.parse(content_markdown)?;

        let section_a = result
            .heading_blocks
            .iter()
            .find(|block| block.path.contains(&"Section A".to_string()))
            .expect("Section A should be found");

        assert!(section_a.content.contains("This is content for section A"));
        assert!(section_a.content.contains("multiple lines"));

        let section_b = result
            .heading_blocks
            .iter()
            .find(|block| block.path.contains(&"Section B".to_string()))
            .expect("Section B should be found");

        assert!(
            section_b
                .content
                .contains("Different content for section B")
        );

        Ok(())
    }

    #[test]
    fn test_line_number_tracking() -> Result<()> {
        let mut parser = create_test_parser();
        let numbered_content =
            "Line 1\n# Heading at line 2\nLine 3\nLine 4\n## Sub at line 5\nLine 6";

        let result = parser.parse(numbered_content)?;

        assert_eq!(result.line_count, 6);

        let heading_block = result
            .heading_blocks
            .iter()
            .find(|block| block.path.contains(&"Heading at line 2".to_string()));

        if let Some(block) = heading_block {
            assert!(block.start_line >= 1);
            assert!(block.end_line <= result.line_count);
            assert!(block.start_line <= block.end_line);
        }

        Ok(())
    }

    #[test]
    fn test_toc_generation() -> Result<()> {
        let mut parser = create_test_parser();
        let hierarchical = r"# Top Level

## First Sub
### Deep Sub 1
### Deep Sub 2

## Second Sub
### Another Deep
#### Very Deep

# Another Top
";

        let result = parser.parse(hierarchical)?;

        assert!(!result.toc.is_empty());

        let first_top = &result.toc[0];
        assert!(first_top.heading_path.contains(&"Top Level".to_string()));

        if !first_top.children.is_empty() {
            let first_sub = &first_top.children[0];
            assert!(first_sub.heading_path.len() >= 2);
        }

        Ok(())
    }

    proptest! {
        #[test]
        fn test_parser_never_panics_on_arbitrary_input(
            content in prop::string::string_regex("[\\x20-\\x7E\\n\\r\\t]{0,500}").unwrap()
        ) {
            let mut parser = create_test_parser();

            let result = parser.parse(&content);

            if let Ok(parse_result) = result {
                prop_assert!(parse_result.line_count == content.lines().count());
                prop_assert!(!parse_result.heading_blocks.is_empty());
            }
        }

        #[test]
        fn test_line_count_accuracy(
            lines in prop::collection::vec(
                prop::string::string_regex("[\\x20-\\x7E]{0,100}").unwrap(),
                0..50
            )
        ) {
            let content = lines.join("\n");
            let mut parser = create_test_parser();
            let expected_lines = if content.is_empty() {
                0
            } else {
                content.lines().count()
            };

            if let Ok(result) = parser.parse(&content) {
                prop_assert_eq!(result.line_count, expected_lines);
            }
        }

        #[test]
        fn test_single_heading_parsing(heading_text in r"[a-zA-Z][a-zA-Z0-9 ]{2,30}") {
            let mut parser = create_test_parser();
            let markdown = format!("# {heading_text}");

            let trimmed = heading_text.trim();
            if trimmed.is_empty() || trimmed.len() < 2 {
                return Ok(());
            }

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(!result.heading_blocks.is_empty());

                if !result.toc.is_empty() {
                    let has_heading = result.heading_blocks.iter()
                        .any(|block| block.path.iter().any(|p| p.contains(trimmed)));
                    prop_assert!(has_heading);
                }
            }
        }

        #[test]
        fn test_heading_level_detection_consistency(
            levels in prop::collection::vec(1u8..=6, 1..10)
        ) {
            let mut parser = create_test_parser();

            let mut markdown = String::new();
            let mut expected_path_lens = Vec::new();

            for (i, level) in levels.iter().enumerate() {
                let heading_text = format!("Heading {}", i + 1);
                let heading_line = format!("{} {}\n\nContent for heading {}\n\n",
                    "#".repeat(*level as usize),
                    heading_text,
                    i + 1);
                markdown.push_str(&heading_line);
                expected_path_lens.push(*level as usize);
            }

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(result.heading_blocks.len() >= levels.len().min(1));

                for (i, expected_depth) in expected_path_lens.iter().enumerate() {
                    if i < result.heading_blocks.len() {
                        let actual_depth = result.heading_blocks[i].path.len();
                        prop_assert!(actual_depth <= *expected_depth);
                        prop_assert!(actual_depth >= 1);
                    }
                }
            }
        }

        #[test]
        fn test_unicode_content_preservation(
            content in r"[\u{0080}-\u{FFFF}]{1,100}"
        ) {
            let mut parser = create_test_parser();
            let markdown = format!("# Unicode Test\n\n{content}");

            if let Ok(result) = parser.parse(&markdown) {
                let has_unicode = result.heading_blocks.iter()
                    .any(|block| block.content.contains(&content));
                prop_assert!(has_unicode, "Unicode content should be preserved");

                prop_assert_eq!(result.line_count, markdown.lines().count());
            }
        }

        #[test]
        fn test_mixed_line_endings(
            line_ending in prop_oneof![Just("\n"), Just("\r\n"), Just("\r")]
        ) {
            let mut parser = create_test_parser();
            let content_lines = ["# Main Heading",
                "",
                "This is content.",
                "",
                "## Sub Heading",
                "",
                "More content here."];

            let markdown = content_lines.join(line_ending);

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(!result.heading_blocks.is_empty());

                let main_heading = result.heading_blocks.iter()
                    .any(|block| block.path.iter().any(|p| p.contains("Main Heading")));
                let sub_heading = result.heading_blocks.iter()
                    .any(|block| block.path.iter().any(|p| p.contains("Sub Heading")));

                prop_assert!(main_heading || sub_heading, "Should find at least one heading");
            }
        }

        #[test]
        fn test_deeply_nested_structure(depth in 1usize..20) {
            let mut parser = create_test_parser();
            let mut markdown = String::new();

            for level in 1..=depth.min(6) {
                let heading = format!("{} Level {} Heading\n\nContent at level {}.\n\n",
                    "#".repeat(level), level, level);
                markdown.push_str(&heading);
            }

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(!result.heading_blocks.is_empty());
                prop_assert!(!result.toc.is_empty());

                if let Some(deepest) = result.heading_blocks.iter()
                    .max_by_key(|block| block.path.len()) {
                    prop_assert!(deepest.path.len() <= depth.min(6));
                }
            }
        }

        #[test]
        fn test_large_content_blocks(
            block_size in 100usize..5000,
            num_blocks in 1usize..10
        ) {
            let mut parser = create_test_parser();
            let mut markdown = String::new();

            for i in 0..num_blocks {
                markdown.push_str(&format!("# Heading {}\n\n", i + 1));

                let content_line = format!("This is line {i} of content. ");
                let large_content = content_line.repeat(block_size / content_line.len());
                markdown.push_str(&large_content);
                markdown.push_str("\n\n");
            }

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert_eq!(result.heading_blocks.len(), num_blocks);

                for block in &result.heading_blocks {
                    prop_assert!(block.content.len() > block_size / 2);
                }

                prop_assert!(result.line_count >= num_blocks * 3);
            }
        }

        #[test]
        fn test_markdown_syntax_edge_cases(
            syntax_char in prop_oneof![
                Just("*"), Just("_"), Just("`"), Just("~"),
                Just("["), Just("]"), Just("("), Just(")"),
                Just("!"), Just("#"), Just(">"), Just("-"),
                Just("+"), Just("="), Just("|"), Just("\\")
            ]
        ) {
            let mut parser = create_test_parser();

            let markdown = format!(
                "# Test Heading\n\nContent with {syntax_char} special {syntax_char} characters {syntax_char} here.\n\n## Another {syntax_char}\n\nMore {syntax_char} content."
            );

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(!result.heading_blocks.is_empty());

                let has_special_chars = result.heading_blocks.iter()
                    .any(|block| block.content.contains(syntax_char));
                prop_assert!(has_special_chars, "Special characters should be preserved");
            }
        }

        #[test]
        fn test_heading_with_formatting(
            format_type in prop_oneof![
                Just("**bold**"),
                Just("_italic_"),
                Just("`code`"),
                Just("~~strike~~")
            ],
            heading_text in r"[a-zA-Z ]{5,20}"
        ) {
            let mut parser = create_test_parser();
            let formatted_heading = format!("# {heading_text} {format_type}\n\nContent here.");

            if let Ok(result) = parser.parse(&formatted_heading) {
                prop_assert!(!result.heading_blocks.is_empty());

                let heading_found = result.heading_blocks.iter()
                    .any(|block| block.path.iter()
                        .any(|p| p.contains(heading_text.trim())));
                prop_assert!(heading_found, "Should find heading text");
            }
        }

        #[test]
        fn test_random_whitespace_patterns(
            spaces_before in 0usize..4,
            spaces_after in 0usize..10,
            tabs_mixed in 0usize..5
        ) {
            let mut parser = create_test_parser();

            let whitespace_prefix = " ".repeat(spaces_before);
            let whitespace_suffix = format!("{}{}",
                " ".repeat(spaces_after),
                "\t".repeat(tabs_mixed));

            let markdown = format!("{whitespace_prefix}# Test Heading{whitespace_suffix}\n\nContent here.");

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(!result.heading_blocks.is_empty());

                let found_heading = result.heading_blocks.iter()
                    .any(|block| block.path.iter()
                        .any(|p| p.contains("Test Heading")));
                prop_assert!(found_heading, "Should find heading with {} spaces before", spaces_before);
            }
        }

        #[test]
        fn test_content_with_code_blocks(
            language in prop_oneof![
                Just("rust"), Just("javascript"), Just("python"),
                Just("bash"), Just("json"), Just("")
            ],
            code_lines in prop::collection::vec(r"[a-zA-Z0-9 ]{0,50}", 1..10)
        ) {
            let mut parser = create_test_parser();

            let code_content = code_lines.join("\n");
            let markdown = format!(
                "# Code Example\n\nHere's some code:\n\n```{language}\n{code_content}\n```\n\n## After Code\n\nMore content."
            );

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(!result.heading_blocks.is_empty());

                let has_code = result.heading_blocks.iter()
                    .any(|block| block.content.contains(&code_content));
                prop_assert!(has_code, "Code content should be preserved");

                let headings: Vec<_> = result.heading_blocks.iter()
                    .flat_map(|block| &block.path)
                    .collect();
                let has_main = headings.iter().any(|h| h.contains("Code Example"));
                let has_after = headings.iter().any(|h| h.contains("After Code"));

                prop_assert!(has_main || has_after, "Should find at least one heading");
            }
        }
    }

    #[test]
    fn test_parser_handles_malicious_markdown() -> Result<()> {
        let malicious_inputs = vec![
            format!("# {}", "A".repeat(10000)),
            (1..=100)
                .map(|i| format!("{} Level {}", "#".repeat(i % 6 + 1), i))
                .collect::<Vec<_>>()
                .join("\n"),
            "# \u{202e}reversed\u{202d} heading".to_string(),
            "# Heading with \x00 null \x01 characters".to_string(),
            format!(
                "# Top\n{}",
                (2..=50)
                    .map(|i| format!("{} Level {}", "#".repeat(i), i))
                    .collect::<Vec<_>>()
                    .join("\n")
            ),
            "# Heading 1\r\n## Heading 2\n### Heading 3\r#### Heading 4".to_string(),
        ];

        let mut parser = create_test_parser();

        for malicious_input in malicious_inputs {
            let result = parser.parse(&malicious_input);

            if let Ok(parse_result) = result {
                assert!(parse_result.line_count <= malicious_input.lines().count() + 1);
                assert!(!parse_result.heading_blocks.is_empty());
            }
        }

        Ok(())
    }

    #[test]
    fn test_parser_handles_unicode_content() -> Result<()> {
        let unicode_markdown = r"# 日本語のヘッダー

これは日本語のコンテンツです。

## العنوان العربي

محتوى باللغة العربية.

### Заголовок на русском

Русский контент.

#### 🚀 Emoji Header 🎉

Content with emojis: 😀 🎈 🌟

##### Mixed: English 中文 العربية русский
";

        let mut parser = create_test_parser();

        let result = parser.parse(unicode_markdown)?;

        assert!(!result.heading_blocks.is_empty());
        assert!(!result.toc.is_empty());

        let all_paths: Vec<_> = result
            .heading_blocks
            .iter()
            .flat_map(|block| &block.path)
            .collect();

        assert!(all_paths.iter().any(|p| p.contains("日本語")));
        assert!(all_paths.iter().any(|p| p.contains("العربي")));
        assert!(all_paths.iter().any(|p| p.contains("русском")));
        assert!(all_paths.iter().any(|p| p.contains("🚀")));

        Ok(())
    }

    #[test]
    fn test_parser_memory_efficiency() -> Result<()> {
        let large_doc = format!(
            "# Main\n\n{}\n\n## Sub\n\n{}",
            "Content line.\n".repeat(1000),
            "More content.\n".repeat(1000)
        );

        let mut parser = create_test_parser();

        let result = parser.parse(&large_doc)?;

        assert!(!result.heading_blocks.is_empty());
        assert_eq!(result.line_count, large_doc.lines().count());

        let main_block = result
            .heading_blocks
            .iter()
            .find(|block| block.path.contains(&"Main".to_string()));
        assert!(main_block.is_some());

        Ok(())
    }

    #[test]
    fn test_parser_edge_cases() -> Result<()> {
        let edge_cases = vec![
            " \n\t\n ",
            "# A\n## B\n### C\n#### D",
            "# !!!\n## ???\n### ***",
            "#\n##\n###",
            "# Heading \n## Another ",
            "# ATX Style\nSetext Style\n============",
        ];

        let mut parser = create_test_parser();

        for edge_case in edge_cases {
            let result = parser.parse(edge_case);

            match result {
                Ok(parse_result) => {
                    assert!(parse_result.line_count == edge_case.lines().count());
                    assert!(!parse_result.heading_blocks.is_empty());
                }
                Err(e) => {
                    assert!(e.to_string().contains("parse") || e.to_string().contains("Parse"));
                }
            }
        }

        Ok(())
    }

    #[test]
    fn test_diagnostic_generation() -> Result<()> {
        let problematic_markdown = r"Some content without headings

More content here

And even more content
";

        let mut parser = create_test_parser();

        let result = parser.parse(problematic_markdown)?;

        assert!(!result.diagnostics.is_empty());

        let warning_diagnostic = result.diagnostics.iter().find(|d| {
            matches!(d.severity, DiagnosticSeverity::Warn) && d.message.contains("No headings")
        });
        assert!(warning_diagnostic.is_some());

        Ok(())
    }

    #[test]
    fn test_parser_consistency() -> Result<()> {
        let mut parser = create_test_parser();
        let markdown = simple_markdown();

        let result1 = parser.parse(markdown)?;
        let result2 = parser.parse(markdown)?;

        assert_eq!(result1.heading_blocks.len(), result2.heading_blocks.len());
        assert_eq!(result1.toc.len(), result2.toc.len());
        assert_eq!(result1.line_count, result2.line_count);

        for (block1, block2) in result1
            .heading_blocks
            .iter()
            .zip(result2.heading_blocks.iter())
        {
            assert_eq!(block1.path, block2.path);
            assert_eq!(block1.start_line, block2.start_line);
            assert_eq!(block1.end_line, block2.end_line);
        }

        Ok(())
    }

    #[test]
    #[allow(clippy::similar_names)]
    fn test_heading_blocks_no_duplication() -> Result<()> {
        let markdown = r"# First Heading
SENTINEL_FIRST_START
Content under first heading
with multiple lines
SENTINEL_FIRST_END

## First Sub
SENTINEL_SUB_START
Content under first sub
SENTINEL_SUB_END

## Second Sub
SENTINEL_SUB2_START
Content under second sub
SENTINEL_SUB2_END

# Second Heading
SENTINEL_SECOND_START
Final content
SENTINEL_SECOND_END";

        let mut parser = create_test_parser();
        let result = parser.parse(markdown)?;

        assert_eq!(
            result.heading_blocks.len(),
            4,
            "Should have 4 heading blocks"
        );

        for block in &result.heading_blocks {
            let first_count = block.content.matches("SENTINEL_FIRST_START").count();
            let sub_count = block.content.matches("SENTINEL_SUB_START").count();
            let sub2_count = block.content.matches("SENTINEL_SUB2_START").count();
            let second_count = block.content.matches("SENTINEL_SECOND_START").count();

            assert!(first_count <= 1, "First sentinel duplicated");
            assert!(sub_count <= 1, "Sub sentinel duplicated");
            assert!(sub2_count <= 1, "Sub2 sentinel duplicated");
            assert!(second_count <= 1, "Second sentinel duplicated");
        }

        let first_block = &result.heading_blocks[0];
        assert!(first_block.content.contains("SENTINEL_FIRST_START"));
        assert!(first_block.content.contains("SENTINEL_FIRST_END"));
        assert!(!first_block.content.contains("SENTINEL_SUB_START"));

        let sub_block = &result.heading_blocks[1];
        assert!(sub_block.content.contains("SENTINEL_SUB_START"));
        assert!(sub_block.content.contains("SENTINEL_SUB_END"));
        assert!(!sub_block.content.contains("SENTINEL_FIRST"));
        assert!(!sub_block.content.contains("SENTINEL_SUB2"));

        let sub2_block = &result.heading_blocks[2];
        assert!(sub2_block.content.contains("SENTINEL_SUB2_START"));
        assert!(sub2_block.content.contains("SENTINEL_SUB2_END"));
        assert!(!sub2_block.content.contains("SENTINEL_SUB_START"));
        assert!(!sub2_block.content.contains("SENTINEL_SECOND"));

        let second_block = &result.heading_blocks[3];
        assert!(second_block.content.contains("SENTINEL_SECOND_START"));
        assert!(second_block.content.contains("SENTINEL_SECOND_END"));
        assert!(!second_block.content.contains("SENTINEL_FIRST"));
        assert!(!second_block.content.contains("SENTINEL_SUB"));

        Ok(())
    }

    #[test]
    fn test_line_ranges_accuracy() -> Result<()> {
        let markdown = "# Heading at Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n## Sub at Line 6\nLine 7\nLine 8\n# Another at Line 9\nLine 10";

        let mut parser = create_test_parser();
        let result = parser.parse(markdown)?;

        assert_eq!(result.line_count, 10, "Should have 10 lines total");
        assert_eq!(
            result.heading_blocks.len(),
            3,
            "Should have 3 heading blocks"
        );

        let first = &result.heading_blocks[0];
        assert_eq!(first.path, vec!["Heading at Line 1"]);
        assert_eq!(first.start_line, 1, "First heading starts at line 1");
        assert_eq!(first.end_line, 5, "First heading ends at line 5");

        let second = &result.heading_blocks[1];
        assert_eq!(second.path, vec!["Heading at Line 1", "Sub at Line 6"]);
        assert_eq!(second.start_line, 6, "Sub heading starts at line 6");
        assert_eq!(second.end_line, 8, "Sub heading ends at line 8");

        let third = &result.heading_blocks[2];
        assert_eq!(third.path, vec!["Another at Line 9"]);
        assert_eq!(third.start_line, 9, "Another heading starts at line 9");
        assert_eq!(third.end_line, 10, "Another heading ends at line 10");

        Ok(())
    }

    #[test]
    fn test_unicode_mixed_headings_edge_cases() -> Result<()> {
        let markdown = r"# 🔥 Main Section
Content with emoji

## Ünïcödë Heading
Спецйальные символы

### Deep → Nested ← Section
More content here

#### Even Deeper
Nested content

##### Fifth Level
Very deep

###### Sixth Level
Deepest level

### Back to Level 3
After deep nesting";

        let mut parser = create_test_parser();
        let result = parser.parse(markdown)?;

        assert!(
            result.heading_blocks.len() >= 7,
            "Should extract all heading levels"
        );

        assert!(result.heading_blocks[0].path[0].contains("🔥"));
        assert!(result.heading_blocks[1].path[1].contains("Ünïcödë"));

        let deep_block = result
            .heading_blocks
            .iter()
            .find(|b| b.path.last().is_some_and(|p| p.contains("Fifth Level")))
            .expect("Should find Fifth Level heading");
        assert!(
            deep_block.path.len() >= 5,
            "Fifth level should be deeply nested"
        );

        let back_block = result
            .heading_blocks
            .iter()
            .find(|b| b.path.last().is_some_and(|p| p.contains("Back to Level 3")))
            .expect("Should find Back to Level 3 heading");
        assert_eq!(
            back_block.path.len(),
            3,
            "Should be at level 3 after backtracking"
        );

        Ok(())
    }
}