1use crate::{
140 Diagnostic, DiagnosticSeverity, Error, HeadingBlock, Result, TocEntry, heading::path_variants,
141};
142use base64::{Engine, engine::general_purpose::STANDARD as B64};
143use sha2::{Digest, Sha256};
144const FALLBACK_WINDOW_LINES: usize = 200;
146use std::collections::VecDeque;
147use tree_sitter::{Node, Parser, TreeCursor};
148
/// Markdown document parser built on tree-sitter's markdown grammar.
///
/// Holds a configured [`Parser`] so repeated `parse` calls reuse the same
/// underlying tree-sitter instance.
pub struct MarkdownParser {
    // Configured with `tree_sitter_md::LANGUAGE` in `new`.
    parser: Parser,
}
181
182impl MarkdownParser {
183 pub fn new() -> Result<Self> {
219 let mut parser = Parser::new();
220 parser
221 .set_language(&tree_sitter_md::LANGUAGE.into())
222 .map_err(|e| Error::Parse(format!("Failed to set language: {e}")))?;
223
224 Ok(Self { parser })
225 }
226
227 pub fn parse(&mut self, text: &str) -> Result<ParseResult> {
305 let tree = self
306 .parser
307 .parse(text, None)
308 .ok_or_else(|| Error::Parse("Failed to parse markdown".into()))?;
309
310 let root = tree.root_node();
311 let mut diagnostics = Vec::new();
312 let mut heading_blocks = Vec::new();
313 let mut toc = Vec::new();
314
315 if root.has_error() {
316 diagnostics.push(Diagnostic {
317 severity: DiagnosticSeverity::Warn,
318 message: "Parse tree contains errors, using fallback parsing".into(),
319 line: None,
320 });
321 }
322
323 let mut cursor = root.walk();
324 Self::extract_headings(&mut cursor, text, &mut heading_blocks, &mut toc);
325
326 if heading_blocks.is_empty() {
327 diagnostics.push(Diagnostic {
328 severity: DiagnosticSeverity::Warn,
329 message: "No headings found in document".into(),
330 line: Some(1),
331 });
332
333 let total_lines = text.lines().count();
336 if total_lines <= FALLBACK_WINDOW_LINES {
337 let path = vec!["Document".into()];
338 let variants = path_variants(&path);
339 heading_blocks.push(HeadingBlock {
340 path,
341 display_path: variants.display_segments,
342 normalized_tokens: variants.tokens,
343 content: text.to_string(),
344 start_line: 1,
345 end_line: total_lines,
346 });
347 } else {
348 let mut start = 1usize;
349 let mut current = String::new();
350 let mut count = 0usize;
351 for line in text.lines() {
352 if count > 0 {
353 current.push('\n');
354 }
355 current.push_str(line);
356 count += 1;
357 if count == FALLBACK_WINDOW_LINES {
358 let end_line = start + count - 1;
359 let path = vec!["Document".into()];
360 let variants = path_variants(&path);
361 heading_blocks.push(HeadingBlock {
362 path,
363 display_path: variants.display_segments,
364 normalized_tokens: variants.tokens,
365 content: std::mem::take(&mut current),
366 start_line: start,
367 end_line,
368 });
369 start = end_line + 1;
370 count = 0;
371 }
372 }
373 if !current.is_empty() {
374 let end_line = start + count - 1;
375 let path = vec!["Document".into()];
376 let variants = path_variants(&path);
377 heading_blocks.push(HeadingBlock {
378 path,
379 display_path: variants.display_segments,
380 normalized_tokens: variants.tokens,
381 content: current,
382 start_line: start,
383 end_line,
384 });
385 }
386 }
387 }
388
389 let line_count = text.lines().count();
390
391 Ok(ParseResult {
392 heading_blocks,
393 toc,
394 diagnostics,
395 line_count,
396 })
397 }
398
399 fn extract_headings(
400 cursor: &mut TreeCursor,
401 text: &str,
402 blocks: &mut Vec<HeadingBlock>,
403 toc: &mut Vec<TocEntry>,
404 ) {
405 #[derive(Debug)]
407 struct HeadingInfo {
408 level: usize,
409 text: String,
410 byte_start: usize,
411 line_start: usize,
412 }
413
414 let mut headings = Vec::new();
415
416 Self::walk_tree(cursor, text, |node| {
418 if node.kind() == "atx_heading" {
419 let level = Self::get_heading_level(node, text);
420 let heading_text = Self::get_heading_text(node, text);
421 let line_start = node.start_position().row;
422
423 headings.push(HeadingInfo {
424 level,
425 text: heading_text,
426 byte_start: node.byte_range().start,
427 line_start,
428 });
429 }
430 });
431
432 if headings.is_empty() {
434 return;
435 }
436
437 headings.sort_by_key(|h| h.byte_start);
439
440 let mut current_path = Vec::new();
442 let mut stack: VecDeque<usize> = VecDeque::new();
443 let mut baseline_level: Option<usize> = None;
444
445 for i in 0..headings.len() {
446 let heading = &headings[i];
447
448 let trimmed = heading.text.trim();
450 if heading.level == 1 && trimmed.starts_with("404") {
451 current_path.clear();
453 stack.clear();
454 continue;
455 }
456
457 if baseline_level.is_none_or(|level| heading.level < level) {
458 baseline_level = Some(heading.level);
459 }
460 let baseline = baseline_level.unwrap_or(1);
461 let effective_level = heading
462 .level
463 .saturating_sub(baseline.saturating_sub(1))
464 .max(1);
465
466 while stack.len() >= effective_level {
467 stack.pop_back();
468 current_path.pop();
469 }
470 current_path.push(heading.text.clone());
471 stack.push_back(effective_level);
472
473 let content_start = heading.byte_start;
475 let content_end = if i + 1 < headings.len() {
476 headings[i + 1].byte_start
477 } else {
478 text.len()
479 };
480
481 let content = &text[content_start..content_end];
483
484 let start_line = heading.line_start + 1; let end_line = if i + 1 < headings.len() {
487 headings[i + 1].line_start } else {
489 text.lines().count()
490 };
491
492 let variants = path_variants(¤t_path);
493 let display_path = variants.display_segments.clone();
494 let normalized_segments = variants.normalized_segments.clone();
495 let normalized_tokens = variants.tokens.clone();
496
497 blocks.push(HeadingBlock {
499 path: current_path.clone(),
500 display_path: display_path.clone(),
501 normalized_tokens: normalized_tokens.clone(),
502 content: content.to_string(),
503 start_line,
504 end_line,
505 });
506
507 let anchor = Some(Self::compute_anchor(¤t_path, &heading.text, content));
509
510 let entry = TocEntry {
512 heading_path: current_path.clone(),
513 heading_path_display: Some(display_path),
514 heading_path_normalized: Some(normalized_segments),
515 lines: if end_line > start_line {
516 format!("{start_line}-{end_line}")
517 } else {
518 format!("{start_line}")
519 },
520 anchor,
521 children: Vec::new(),
522 };
523
524 Self::add_to_toc(toc, entry, stack.len());
525 }
526 }
527
528 fn compute_anchor(_path: &[String], heading_text: &str, _content: &str) -> String {
529 let mut hasher = Sha256::new();
530 hasher.update(heading_text.trim().to_lowercase().as_bytes());
532 let digest = hasher.finalize();
533 let full = B64.encode(digest);
534 full[..22.min(full.len())].to_string()
536 }
537
538 fn walk_tree<F>(cursor: &mut TreeCursor, _text: &str, mut callback: F)
539 where
540 F: FnMut(Node),
541 {
542 loop {
543 let node = cursor.node();
544 callback(node);
545
546 if cursor.goto_first_child() {
547 continue;
548 }
549
550 if cursor.goto_next_sibling() {
551 continue;
552 }
553
554 loop {
555 if !cursor.goto_parent() {
556 return;
557 }
558 if cursor.goto_next_sibling() {
559 break;
560 }
561 }
562 }
563 }
564
565 fn get_heading_level(node: Node, _text: &str) -> usize {
566 for child in node.children(&mut node.walk()) {
567 if child.kind() == "atx_h1_marker" {
568 return 1;
569 } else if child.kind() == "atx_h2_marker" {
570 return 2;
571 } else if child.kind() == "atx_h3_marker" {
572 return 3;
573 } else if child.kind() == "atx_h4_marker" {
574 return 4;
575 } else if child.kind() == "atx_h5_marker" {
576 return 5;
577 } else if child.kind() == "atx_h6_marker" {
578 return 6;
579 }
580 }
581 1
582 }
583
584 fn get_heading_text(node: Node, text: &str) -> String {
585 for child in node.children(&mut node.walk()) {
586 if child.kind().contains("heading") && child.kind().contains("content") {
587 return text[child.byte_range()].trim().to_string();
588 }
589 }
590
591 let full_text = &text[node.byte_range()];
592 full_text.trim_start_matches('#').trim().to_string()
593 }
594
595 fn add_to_toc(toc: &mut Vec<TocEntry>, entry: TocEntry, depth: usize) {
596 if depth == 1 {
597 toc.push(entry);
598 } else if let Some(parent) = toc.last_mut() {
599 Self::add_to_toc_recursive(&mut parent.children, entry, depth - 1);
600 }
601 }
602
603 fn add_to_toc_recursive(toc: &mut Vec<TocEntry>, entry: TocEntry, depth: usize) {
604 if depth == 1 {
605 toc.push(entry);
606 } else if let Some(parent) = toc.last_mut() {
607 Self::add_to_toc_recursive(&mut parent.children, entry, depth - 1);
608 }
609 }
610}
611
/// Output of [`MarkdownParser::parse`].
#[derive(Clone)]
pub struct ParseResult {
    /// Flattened per-heading sections; contains synthetic "Document" blocks
    /// when the input has no headings.
    pub heading_blocks: Vec<HeadingBlock>,

    /// Hierarchical table of contents mirroring heading nesting.
    pub toc: Vec<TocEntry>,

    /// Non-fatal issues found while parsing (e.g. no headings, tree errors).
    pub diagnostics: Vec<Diagnostic>,

    /// Total number of lines in the input text.
    pub line_count: usize,
}
723
724#[cfg(test)]
728#[allow(
729 clippy::unwrap_used,
730 clippy::unnecessary_wraps,
731 clippy::format_push_string,
732 clippy::disallowed_macros
733)]
734mod tests {
735 use super::*;
736 use proptest::prelude::*;
737
    // Builds a parser for tests, panicking if grammar setup fails.
    fn create_test_parser() -> MarkdownParser {
        MarkdownParser::new().expect("Failed to create parser")
    }
742
    #[test]
    fn test_anchor_stability_when_section_moves() {
        // Anchors derive from heading text only, so moving a section must
        // keep its anchor while its line range changes.
        let mut parser = create_test_parser();

        let doc_v1 = "# Intro\n\nPrelude.\n\n## Section A\n\nAlpha content line 1.\nAlpha content line 2.\n\n## Section B\n\nBeta content.\n";

        let result_v1 = parser.parse(doc_v1).expect("parse v1");
        // Depth-first search for the TOC entry whose last segment matches.
        #[allow(clippy::items_after_statements)]
        fn find<'a>(entries: &'a [TocEntry], name: &str) -> Option<&'a TocEntry> {
            for e in entries {
                if e.heading_path.last().is_some_and(|h| h == name) {
                    return Some(e);
                }
                if let Some(found) = find(&e.children, name) {
                    return Some(found);
                }
            }
            None
        }
        let a_v1 = find(&result_v1.toc, "Section A").expect("section A in v1");
        let anchor_v1 = a_v1.anchor.clone().expect("anchor v1");
        let lines_v1 = a_v1.lines.clone();

        // v2 swaps the order of Section A and Section B.
        let doc_v2 = "# Intro\n\nPrelude.\n\n## Section B\n\nBeta content.\n\n## Section A\n\nAlpha content line 1.\nAlpha content line 2.\n";
        let result_v2 = parser.parse(doc_v2).expect("parse v2");
        let a_v2 = find(&result_v2.toc, "Section A").expect("section A in v2");
        let anchor_v2 = a_v2.anchor.clone().expect("anchor v2");
        let lines_v2 = a_v2.lines.clone();

        assert_eq!(anchor_v1, anchor_v2, "anchor stable across moves");
        assert_ne!(lines_v1, lines_v2, "lines should reflect new position");
    }
777
    #[test]
    fn test_skips_placeholder_404_headings() -> Result<()> {
        // "# 404" placeholder pages are dropped from the TOC, but the
        // headings nested beneath them must remain reachable.
        let mut parser = create_test_parser();

        let doc = r"# 404

Check the URL.

## Actual Section

Real content lives here.

### Nested Detail

Additional context.

## Follow Up

More guidance.
";

        let result = parser.parse(doc)?;

        assert_eq!(
            result.toc.len(),
            2,
            "top-level entries should ignore 404 headings"
        );
        assert!(
            result.toc.iter().all(|entry| entry
                .heading_path
                .iter()
                .all(|component| !component.starts_with("404"))),
            "toc should not contain placeholder 404 entries"
        );
        assert_eq!(
            result.heading_blocks.len(),
            3,
            "children under 404 should remain accessible"
        );
        assert_eq!(result.heading_blocks[0].path[0], "Actual Section");

        Ok(())
    }
822
    // Fixture: three nesting levels with ordinary content.
    fn simple_markdown() -> &'static str {
        r"# Main Heading

This is some content under the main heading.

## Sub Heading

More content here.

### Deep Heading

Even deeper content.

## Another Sub

Final content.
"
    }
841
    // Fixture: realistic docs page with code fences, lists, and four levels.
    fn complex_markdown() -> &'static str {
        r#"# Getting Started

Welcome to our documentation!

## Installation

Run the following command:

```bash
npm install
```

### Requirements

- Node.js 16+
- npm 7+

## Usage

Here's how to use it:

1. First step
2. Second step

### Advanced Usage

For advanced users:

#### Configuration

Edit the config file:

```json
{
    "key": "value"
}
```

## Troubleshooting

Common issues:

- Issue 1
- Issue 2
"#
    }
889
    // Fixture: intentionally broken markdown (unclosed code, odd nesting).
    fn malformed_markdown() -> &'static str {
        r"# Broken Heading
## Missing content

### Unmatched brackets ][

Content with `unclosed code

> Broken quote
>> Nested broken quote

* List item
  * Nested without proper spacing
* Another item

```
Unclosed code block
"
    }
909
910 #[test]
911 fn test_parser_creation() {
912 let result = MarkdownParser::new();
915
916 assert!(result.is_ok());
918 }
919
    #[test]
    fn test_parse_simple_markdown() -> Result<()> {
        // Basic happy path: blocks, TOC, line count, and specific headings.
        let mut parser = create_test_parser();
        let markdown = simple_markdown();

        let result = parser.parse(markdown)?;

        assert!(!result.heading_blocks.is_empty());
        assert!(!result.toc.is_empty());
        assert_eq!(result.line_count, markdown.lines().count());

        let main_heading = result
            .heading_blocks
            .iter()
            .find(|block| block.path.contains(&"Main Heading".to_string()));
        assert!(main_heading.is_some());

        let sub_heading = result
            .heading_blocks
            .iter()
            .find(|block| block.path.contains(&"Sub Heading".to_string()));
        assert!(sub_heading.is_some());

        Ok(())
    }
950
    #[test]
    fn test_parse_complex_markdown_structure() -> Result<()> {
        // Code fences and lists must not break heading extraction.
        let mut parser = create_test_parser();
        let markdown = complex_markdown();

        let result = parser.parse(markdown)?;

        assert!(result.heading_blocks.len() >= 5); let headings: Vec<_> = result
            .heading_blocks
            .iter()
            .flat_map(|block| &block.path)
            .collect();

        assert!(headings.iter().any(|h| h.contains("Getting Started")));
        assert!(headings.iter().any(|h| h.contains("Installation")));
        assert!(headings.iter().any(|h| h.contains("Requirements")));
        assert!(headings.iter().any(|h| h.contains("Configuration")));

        assert!(!result.toc.is_empty());
        let top_level = &result.toc[0];
        assert!(
            top_level
                .heading_path
                .contains(&"Getting Started".to_string())
        );

        Ok(())
    }
986
    #[test]
    fn test_parse_malformed_markdown() -> Result<()> {
        // Broken syntax must still yield blocks (fallback or partial parse).
        let mut parser = create_test_parser();
        let markdown = malformed_markdown();

        let result = parser.parse(markdown)?;

        assert!(!result.heading_blocks.is_empty()); Ok(())
    }

    #[test]
    fn test_parse_empty_document() -> Result<()> {
        // Empty input: zero lines, at most one fallback block, and a warning.
        let mut parser = create_test_parser();
        let empty = "";

        let result = parser.parse(empty)?;

        assert_eq!(result.line_count, 0);
        assert!(result.heading_blocks.len() <= 1); assert!(
            result
                .diagnostics
                .iter()
                .any(|d| d.message.contains("No headings found")
                    || d.severity == DiagnosticSeverity::Warn)
        );

        Ok(())
    }
1027
    #[test]
    fn test_parse_document_without_headings() -> Result<()> {
        // Heading-free text becomes a single "Document" fallback block plus
        // a "No headings found" diagnostic.
        let mut parser = create_test_parser();
        let no_headings = r"This is just plain text.

With multiple paragraphs.

And some more content.

But no headings at all.
";

        let result = parser.parse(no_headings)?;

        assert_eq!(result.heading_blocks.len(), 1);
        let block = &result.heading_blocks[0];
        assert_eq!(block.path, vec!["Document".to_string()]);
        assert_eq!(block.content.trim(), no_headings.trim());

        assert!(
            result
                .diagnostics
                .iter()
                .any(|d| d.message.contains("No headings found"))
        );

        Ok(())
    }
1060
    #[test]
    fn test_windowed_segmentation_for_large_unstructured() -> Result<()> {
        // Heading-free text longer than FALLBACK_WINDOW_LINES is split into
        // fixed windows: 2 full windows + 25 leftover lines = 3 blocks.
        let mut parser = create_test_parser();
        let total = FALLBACK_WINDOW_LINES * 2 + 25; let doc = (1..=total)
            .map(|i| format!("line {i}"))
            .collect::<Vec<_>>()
            .join("\n");

        let result = parser.parse(&doc)?;

        assert_eq!(result.heading_blocks.len(), 3);
        for b in &result.heading_blocks {
            assert_eq!(b.path, vec!["Document".to_string()]);
            assert!(b.start_line >= 1);
            assert!(b.end_line <= total);
        }
        assert_eq!(result.heading_blocks.last().unwrap().end_line, total);

        Ok(())
    }
1085
    #[test]
    fn test_heading_level_detection() -> Result<()> {
        // All six ATX levels should be detected and nest progressively.
        let mut parser = create_test_parser();
        let multilevel = r"# Level 1

## Level 2

### Level 3

#### Level 4

##### Level 5

###### Level 6
";

        let result = parser.parse(multilevel)?;

        assert!(result.heading_blocks.len() >= 6);

        // Path length reflects nesting depth for each block.
        let paths: Vec<_> = result
            .heading_blocks
            .iter()
            .map(|block| block.path.len())
            .collect();

        assert!(paths.contains(&1)); assert!(paths.contains(&2)); assert!(paths.iter().any(|&len| len >= 3)); Ok(())
    }
1123
    #[test]
    fn test_heading_text_extraction() -> Result<()> {
        // Inline formatting inside headings must not lose the visible text.
        let mut parser = create_test_parser();
        let formatted_headings = r"# **Bold Heading**

## _Italic Heading_

### `Code in Heading`

#### Heading with [Link](http://example.com)

##### Heading with **bold** and _italic_
";

        let result = parser.parse(formatted_headings)?;

        let heading_texts: Vec<_> = result
            .heading_blocks
            .iter()
            .flat_map(|block| &block.path)
            .collect();

        assert!(heading_texts.iter().any(|h| h.contains("Bold Heading")));
        assert!(heading_texts.iter().any(|h| h.contains("Italic Heading")));
        assert!(heading_texts.iter().any(|h| h.contains("Code in Heading")));

        Ok(())
    }
1156
    #[test]
    fn test_content_extraction() -> Result<()> {
        // Each section's block must contain that section's body text.
        let mut parser = create_test_parser();
        let content_markdown = r"# Section A

This is content for section A.
It spans multiple lines.

## Subsection A1

More specific content here.

# Section B

Different content for section B.
";

        let result = parser.parse(content_markdown)?;

        let section_a = result
            .heading_blocks
            .iter()
            .find(|block| block.path.contains(&"Section A".to_string()))
            .expect("Section A should be found");

        assert!(section_a.content.contains("This is content for section A"));
        assert!(section_a.content.contains("multiple lines"));

        let section_b = result
            .heading_blocks
            .iter()
            .find(|block| block.path.contains(&"Section B".to_string()))
            .expect("Section B should be found");

        assert!(
            section_b
                .content
                .contains("Different content for section B")
        );

        Ok(())
    }
1202
    #[test]
    fn test_line_number_tracking() -> Result<()> {
        // Reported line ranges must stay within the document bounds.
        let mut parser = create_test_parser();
        let numbered_content =
            "Line 1\n# Heading at line 2\nLine 3\nLine 4\n## Sub at line 5\nLine 6";

        let result = parser.parse(numbered_content)?;

        assert_eq!(result.line_count, 6);

        let heading_block = result
            .heading_blocks
            .iter()
            .find(|block| block.path.contains(&"Heading at line 2".to_string()));

        if let Some(block) = heading_block {
            assert!(block.start_line >= 1);
            assert!(block.end_line <= result.line_count);
            assert!(block.start_line <= block.end_line);
        }

        Ok(())
    }
1231
1232 #[test]
1233 fn test_toc_generation() -> Result<()> {
1234 let mut parser = create_test_parser();
1236 let hierarchical = r"# Top Level
1237
1238## First Sub
1239### Deep Sub 1
1240### Deep Sub 2
1241
1242## Second Sub
1243### Another Deep
1244#### Very Deep
1245
1246# Another Top
1247";
1248
1249 let result = parser.parse(hierarchical)?;
1251
1252 assert!(!result.toc.is_empty());
1254
1255 assert!(!result.toc.is_empty());
1257
1258 let first_top = &result.toc[0];
1260 assert!(first_top.heading_path.contains(&"Top Level".to_string()));
1261
1262 if !first_top.children.is_empty() {
1264 let first_sub = &first_top.children[0];
1265 assert!(first_sub.heading_path.len() >= 2); }
1267
1268 Ok(())
1269 }
1270
1271 proptest! {
1273 #[test]
1278 fn test_parser_never_panics_on_arbitrary_input(
1279 content in prop::string::string_regex("[\\x20-\\x7E\\n\\r\\t]{0,500}").unwrap()
1280 ) {
1281 let mut parser = create_test_parser();
1282
1283 let result = parser.parse(&content);
1285
1286 if let Ok(parse_result) = result {
1288 prop_assert!(parse_result.line_count == content.lines().count());
1289 prop_assert!(!parse_result.heading_blocks.is_empty()); } else {
1291 }
1293 }
1294
        #[test]
        fn test_line_count_accuracy(
            lines in prop::collection::vec(
                prop::string::string_regex("[\\x20-\\x7E]{0,100}").unwrap(),
                0..50
            )
        ) {
            // line_count must match str::lines() semantics (empty input = 0).
            let content = lines.join("\n");
            let mut parser = create_test_parser();
            let expected_lines = if content.is_empty() {
                0
            } else {
                content.lines().count()
            };

            if let Ok(result) = parser.parse(&content) {
                prop_assert_eq!(result.line_count, expected_lines);
            }
        }
1318
        #[test]
        fn test_single_heading_parsing(heading_text in r"[a-zA-Z][a-zA-Z0-9 ]{2,30}") {
            // A lone "# <text>" document should surface that heading.
            let mut parser = create_test_parser();
            let markdown = format!("# {heading_text}");

            // Guard against strategy outputs that trim to near-nothing.
            let trimmed = heading_text.trim();
            if trimmed.is_empty() || trimmed.len() < 2 {
                return Ok(());
            }

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(!result.heading_blocks.is_empty());

                if !result.toc.is_empty() {
                    let has_heading = result.heading_blocks.iter()
                        .any(|block| block.path.iter().any(|p| p.contains(trimmed)));
                    prop_assert!(has_heading);
                }
            }
        }
1343
        #[test]
        fn test_heading_level_detection_consistency(
            levels in prop::collection::vec(1u8..=6, 1..10)
        ) {
            // Path depth can never exceed the raw heading level (baseline
            // normalization only shrinks depth), and is always >= 1.
            let mut parser = create_test_parser();

            let mut markdown = String::new();
            let mut expected_path_lens = Vec::new();

            for (i, level) in levels.iter().enumerate() {
                let heading_text = format!("Heading {}", i + 1);
                let heading_line = format!("{} {}\n\nContent for heading {}\n\n",
                    "#".repeat(*level as usize),
                    heading_text,
                    i + 1);
                markdown.push_str(&heading_line);
                expected_path_lens.push(*level as usize);
            }

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(result.heading_blocks.len() >= levels.len().min(1));

                for (i, expected_depth) in expected_path_lens.iter().enumerate() {
                    if i < result.heading_blocks.len() {
                        let actual_depth = result.heading_blocks[i].path.len();
                        prop_assert!(actual_depth <= *expected_depth);
                        prop_assert!(actual_depth >= 1);
                    }
                }
            }
        }
1379
        #[test]
        fn test_unicode_content_preservation(
            content in r"[\u{0080}-\u{FFFF}]{1,100}"
        ) {
            // Non-ASCII body text must survive byte-range slicing intact.
            let mut parser = create_test_parser();
            let markdown = format!("# Unicode Test\n\n{content}");

            if let Ok(result) = parser.parse(&markdown) {
                let has_unicode = result.heading_blocks.iter()
                    .any(|block| block.content.contains(&content));
                prop_assert!(has_unicode, "Unicode content should be preserved");

                prop_assert_eq!(result.line_count, markdown.lines().count());
            }
        }
1397
        #[test]
        fn test_mixed_line_endings(
            line_ending in prop_oneof![Just("\n"), Just("\r\n"), Just("\r")]
        ) {
            // LF, CRLF, and bare CR documents should all yield headings.
            let mut parser = create_test_parser();
            let content_lines = ["# Main Heading",
                "",
                "This is content.",
                "",
                "## Sub Heading",
                "",
                "More content here."];

            let markdown = content_lines.join(line_ending);

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(!result.heading_blocks.is_empty());

                let main_heading = result.heading_blocks.iter()
                    .any(|block| block.path.iter().any(|p| p.contains("Main Heading")));
                let sub_heading = result.heading_blocks.iter()
                    .any(|block| block.path.iter().any(|p| p.contains("Sub Heading")));

                prop_assert!(main_heading || sub_heading, "Should find at least one heading");
            }
        }
1426
        #[test]
        fn test_deeply_nested_structure(depth in 1usize..20) {
            // Nesting caps at 6 (ATX max), regardless of requested depth.
            let mut parser = create_test_parser();
            let mut markdown = String::new();

            for level in 1..=depth.min(6) {
                let heading = format!("{} Level {} Heading\n\nContent at level {}.\n\n",
                    "#".repeat(level), level, level);
                markdown.push_str(&heading);
            }

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(!result.heading_blocks.is_empty());
                prop_assert!(!result.toc.is_empty());

                if let Some(deepest) = result.heading_blocks.iter()
                    .max_by_key(|block| block.path.len()) {
                    prop_assert!(deepest.path.len() <= depth.min(6));
                }
            }
        }
1451
        #[test]
        fn test_large_content_blocks(
            block_size in 100usize..5000,
            num_blocks in 1usize..10
        ) {
            // Big sections must stay whole: one block per top-level heading,
            // each retaining at least half its generated content.
            let mut parser = create_test_parser();
            let mut markdown = String::new();

            for i in 0..num_blocks {
                markdown.push_str(&format!("# Heading {}\n\n", i + 1));

                let content_line = format!("This is line {i} of content. ");
                let large_content = content_line.repeat(block_size / content_line.len());
                markdown.push_str(&large_content);
                markdown.push_str("\n\n");
            }

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert_eq!(result.heading_blocks.len(), num_blocks);

                for block in &result.heading_blocks {
                    prop_assert!(block.content.len() > block_size / 2);
                }

                prop_assert!(result.line_count >= num_blocks * 3); }
        }
1483
        #[test]
        fn test_markdown_syntax_edge_cases(
            syntax_char in prop_oneof![
                Just("*"), Just("_"), Just("`"), Just("~"),
                Just("["), Just("]"), Just("("), Just(")"),
                Just("!"), Just("#"), Just(">"), Just("-"),
                Just("+"), Just("="), Just("|"), Just("\\")
            ]
        ) {
            // Markdown metacharacters in body text must be preserved verbatim.
            let mut parser = create_test_parser();

            let markdown = format!(
                "# Test Heading\n\nContent with {syntax_char} special {syntax_char} characters {syntax_char} here.\n\n## Another {syntax_char}\n\nMore {syntax_char} content."
            );

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(!result.heading_blocks.is_empty());

                let has_special_chars = result.heading_blocks.iter()
                    .any(|block| block.content.contains(syntax_char));
                prop_assert!(has_special_chars, "Special characters should be preserved");
            }
        }
1510
1511 #[test]
1512 fn test_heading_with_formatting(
1513 format_type in prop_oneof"),
1518 Just("~~strike~~")
1519 ],
1520 heading_text in r"[a-zA-Z ]{5,20}"
1521 ) {
1522 let mut parser = create_test_parser();
1523 let formatted_heading = format!("# {heading_text} {format_type}\n\nContent here.");
1524
1525 if let Ok(result) = parser.parse(&formatted_heading) {
1526 prop_assert!(!result.heading_blocks.is_empty());
1528
1529 let heading_found = result.heading_blocks.iter()
1530 .any(|block| block.path.iter()
1531 .any(|p| p.contains(heading_text.trim())));
1532 prop_assert!(heading_found, "Should find heading text");
1533 }
1534 }
1535
        #[test]
        fn test_random_whitespace_patterns(
            spaces_before in 0usize..4, spaces_after in 0usize..10,
            tabs_mixed in 0usize..5
        ) {
            // Up to 3 leading spaces keeps an ATX heading valid; trailing
            // whitespace is irrelevant.
            let mut parser = create_test_parser();

            let whitespace_prefix = " ".repeat(spaces_before); let whitespace_suffix = format!("{}{}",
                " ".repeat(spaces_after),
                "\t".repeat(tabs_mixed));

            let markdown = format!("{whitespace_prefix}# Test Heading{whitespace_suffix}\n\nContent here.");

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(!result.heading_blocks.is_empty());

                let found_heading = result.heading_blocks.iter()
                    .any(|block| block.path.iter()
                        .any(|p| p.contains("Test Heading")));
                prop_assert!(found_heading, "Should find heading with {} spaces before", spaces_before);
            }
        }
1565
        #[test]
        fn test_content_with_code_blocks(
            language in prop_oneof![
                Just("rust"), Just("javascript"), Just("python"),
                Just("bash"), Just("json"), Just("")
            ],
            code_lines in prop::collection::vec(r"[a-zA-Z0-9 ]{0,50}", 1..10)
        ) {
            // Fenced code must stay inside its section and not hide headings.
            let mut parser = create_test_parser();

            let code_content = code_lines.join("\n");
            let markdown = format!(
                "# Code Example\n\nHere's some code:\n\n```{language}\n{code_content}\n```\n\n## After Code\n\nMore content."
            );

            if let Ok(result) = parser.parse(&markdown) {
                prop_assert!(!result.heading_blocks.is_empty());

                let has_code = result.heading_blocks.iter()
                    .any(|block| block.content.contains(&code_content));
                prop_assert!(has_code, "Code content should be preserved");

                let headings: Vec<_> = result.heading_blocks.iter()
                    .flat_map(|block| &block.path)
                    .collect();
                let has_main = headings.iter().any(|h| h.contains("Code Example"));
                let has_after = headings.iter().any(|h| h.contains("After Code"));

                prop_assert!(has_main || has_after, "Should find at least one heading");
            }
        }
1600 }
1601
    #[test]
    fn test_parser_handles_malicious_markdown() -> Result<()> {
        // Adversarial inputs: huge headings, deep nesting, BiDi controls,
        // NUL bytes, mixed line endings. None may panic.
        let malicious_inputs = vec![
            format!("# {}", "A".repeat(10000)),
            (1..=100)
                .map(|i| format!("{} Level {}", "#".repeat(i % 6 + 1), i))
                .collect::<Vec<_>>()
                .join("\n"),
            "# \u{202e}reversed\u{202d} heading".to_string(),
            "# Heading with \x00 null \x01 characters".to_string(),
            format!(
                "# Top\n{}",
                (2..=50)
                    .map(|i| format!("{} Level {}", "#".repeat(i), i))
                    .collect::<Vec<_>>()
                    .join("\n")
            ),
            "# Heading 1\r\n## Heading 2\n### Heading 3\r#### Heading 4".to_string(),
        ];

        let mut parser = create_test_parser();

        for malicious_input in malicious_inputs {
            let result = parser.parse(&malicious_input);

            if let Ok(parse_result) = result {
                assert!(parse_result.line_count <= malicious_input.lines().count() + 1);
                assert!(!parse_result.heading_blocks.is_empty());
            } else {
                // Parse errors are acceptable; only panics are failures.
            }
        }

        Ok(())
    }
1648
    #[test]
    fn test_parser_handles_unicode_content() -> Result<()> {
        // Headings in CJK, Arabic, Cyrillic, and emoji must parse intact.
        let unicode_markdown = r"# 日本語のヘッダー

これは日本語のコンテンツです。

## العنوان العربي

محتوى باللغة العربية.

### Заголовок на русском

Русский контент.

#### 🚀 Emoji Header 🎉

Content with emojis: 😀 🎈 🌟

##### Mixed: English 中文 العربية русский
";

        let mut parser = create_test_parser();

        let result = parser.parse(unicode_markdown)?;

        assert!(!result.heading_blocks.is_empty());
        assert!(!result.toc.is_empty());

        let all_paths: Vec<_> = result
            .heading_blocks
            .iter()
            .flat_map(|block| &block.path)
            .collect();

        assert!(all_paths.iter().any(|p| p.contains("日本語")));
        assert!(all_paths.iter().any(|p| p.contains("العربي")));
        assert!(all_paths.iter().any(|p| p.contains("русском")));
        assert!(all_paths.iter().any(|p| p.contains("🚀")));

        Ok(())
    }
1694
    #[test]
    fn test_parser_memory_efficiency() -> Result<()> {
        // A ~2000-line document should parse and report accurate counts.
        let large_doc = format!(
            "# Main\n\n{}\n\n## Sub\n\n{}",
            "Content line.\n".repeat(1000),
            "More content.\n".repeat(1000)
        );

        let mut parser = create_test_parser();

        let result = parser.parse(&large_doc)?;

        assert!(!result.heading_blocks.is_empty());
        assert_eq!(result.line_count, large_doc.lines().count());

        let main_block = result
            .heading_blocks
            .iter()
            .find(|block| block.path.contains(&"Main".to_string()));
        assert!(main_block.is_some());

        Ok(())
    }
1722
1723 #[test]
1724 fn test_parser_edge_cases() -> Result<()> {
1725 let edge_cases = vec![
1727 " \n\t\n ",
1729 "# A\n## B\n### C\n#### D",
1731 "# !!!\n## ???\n### ***",
1733 "#\n##\n###",
1735 "# Heading \n## Another ",
1737 "# ATX Style\nSetext Style\n============",
1739 ];
1740
1741 let mut parser = create_test_parser();
1742
1743 for edge_case in edge_cases {
1744 let result = parser.parse(edge_case);
1746
1747 match result {
1749 Ok(parse_result) => {
1750 assert!(parse_result.line_count == edge_case.lines().count());
1751 assert!(!parse_result.heading_blocks.is_empty()); },
1753 Err(e) => {
1754 assert!(e.to_string().contains("parse") || e.to_string().contains("Parse"));
1756 },
1757 }
1758 }
1759
1760 Ok(())
1761 }
1762
1763 #[test]
1764 fn test_diagnostic_generation() -> Result<()> {
1765 let problematic_markdown = r"Some content without headings
1767
1768More content here
1769
1770And even more content
1771";
1772
1773 let mut parser = create_test_parser();
1774
1775 let result = parser.parse(problematic_markdown)?;
1777
1778 assert!(!result.diagnostics.is_empty());
1780
1781 let warning_diagnostic = result.diagnostics.iter().find(|d| {
1782 matches!(d.severity, DiagnosticSeverity::Warn) && d.message.contains("No headings")
1783 });
1784 assert!(warning_diagnostic.is_some());
1785
1786 Ok(())
1787 }
1788
1789 #[test]
1790 fn test_parser_consistency() -> Result<()> {
1791 let mut parser = create_test_parser();
1793 let markdown = simple_markdown();
1794
1795 let result1 = parser.parse(markdown)?;
1797 let result2 = parser.parse(markdown)?;
1798
1799 assert_eq!(result1.heading_blocks.len(), result2.heading_blocks.len());
1801 assert_eq!(result1.toc.len(), result2.toc.len());
1802 assert_eq!(result1.line_count, result2.line_count);
1803
1804 for (block1, block2) in result1
1806 .heading_blocks
1807 .iter()
1808 .zip(result2.heading_blocks.iter())
1809 {
1810 assert_eq!(block1.path, block2.path);
1811 assert_eq!(block1.start_line, block2.start_line);
1812 assert_eq!(block1.end_line, block2.end_line);
1813 }
1814
1815 Ok(())
1816 }
1817
1818 #[test]
1819 #[allow(clippy::similar_names)] fn test_heading_blocks_no_duplication() -> Result<()> {
1821 let markdown = r"# First Heading
1823SENTINEL_FIRST_START
1824Content under first heading
1825with multiple lines
1826SENTINEL_FIRST_END
1827
1828## First Sub
1829SENTINEL_SUB_START
1830Content under first sub
1831SENTINEL_SUB_END
1832
1833## Second Sub
1834SENTINEL_SUB2_START
1835Content under second sub
1836SENTINEL_SUB2_END
1837
1838# Second Heading
1839SENTINEL_SECOND_START
1840Final content
1841SENTINEL_SECOND_END";
1842
1843 let mut parser = create_test_parser();
1844 let result = parser.parse(markdown)?;
1845
1846 assert_eq!(
1848 result.heading_blocks.len(),
1849 4,
1850 "Should have 4 heading blocks"
1851 );
1852
1853 for block in &result.heading_blocks {
1855 let first_count = block.content.matches("SENTINEL_FIRST_START").count();
1857 let sub_count = block.content.matches("SENTINEL_SUB_START").count();
1858 let sub2_count = block.content.matches("SENTINEL_SUB2_START").count();
1859 let second_count = block.content.matches("SENTINEL_SECOND_START").count();
1860
1861 assert!(first_count <= 1, "First sentinel duplicated");
1863 assert!(sub_count <= 1, "Sub sentinel duplicated");
1864 assert!(sub2_count <= 1, "Sub2 sentinel duplicated");
1865 assert!(second_count <= 1, "Second sentinel duplicated");
1866 }
1867
1868 let first_block = &result.heading_blocks[0];
1870 assert!(first_block.content.contains("SENTINEL_FIRST_START"));
1871 assert!(first_block.content.contains("SENTINEL_FIRST_END"));
1872 assert!(!first_block.content.contains("SENTINEL_SUB_START"));
1873
1874 let sub_block = &result.heading_blocks[1];
1875 assert!(sub_block.content.contains("SENTINEL_SUB_START"));
1876 assert!(sub_block.content.contains("SENTINEL_SUB_END"));
1877 assert!(!sub_block.content.contains("SENTINEL_FIRST"));
1878 assert!(!sub_block.content.contains("SENTINEL_SUB2"));
1879
1880 let sub2_block = &result.heading_blocks[2];
1881 assert!(sub2_block.content.contains("SENTINEL_SUB2_START"));
1882 assert!(sub2_block.content.contains("SENTINEL_SUB2_END"));
1883 assert!(!sub2_block.content.contains("SENTINEL_SUB_START"));
1884 assert!(!sub2_block.content.contains("SENTINEL_SECOND"));
1885
1886 let second_block = &result.heading_blocks[3];
1887 assert!(second_block.content.contains("SENTINEL_SECOND_START"));
1888 assert!(second_block.content.contains("SENTINEL_SECOND_END"));
1889 assert!(!second_block.content.contains("SENTINEL_FIRST"));
1890 assert!(!second_block.content.contains("SENTINEL_SUB"));
1891
1892 Ok(())
1893 }
1894
1895 #[test]
1896 fn test_line_ranges_accuracy() -> Result<()> {
1897 let markdown = "# Heading at Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n## Sub at Line 6\nLine 7\nLine 8\n# Another at Line 9\nLine 10";
1899
1900 let mut parser = create_test_parser();
1901 let result = parser.parse(markdown)?;
1902
1903 assert_eq!(result.line_count, 10, "Should have 10 lines total");
1904 assert_eq!(
1905 result.heading_blocks.len(),
1906 3,
1907 "Should have 3 heading blocks"
1908 );
1909
1910 let first = &result.heading_blocks[0];
1912 assert_eq!(first.path, vec!["Heading at Line 1"]);
1913 assert_eq!(first.start_line, 1, "First heading starts at line 1");
1914 assert_eq!(first.end_line, 5, "First heading ends at line 5");
1915
1916 let second = &result.heading_blocks[1];
1918 assert_eq!(second.path, vec!["Heading at Line 1", "Sub at Line 6"]);
1919 assert_eq!(second.start_line, 6, "Sub heading starts at line 6");
1920 assert_eq!(second.end_line, 8, "Sub heading ends at line 8");
1921
1922 let third = &result.heading_blocks[2];
1924 assert_eq!(third.path, vec!["Another at Line 9"]);
1925 assert_eq!(third.start_line, 9, "Another heading starts at line 9");
1926 assert_eq!(third.end_line, 10, "Another heading ends at line 10");
1927
1928 Ok(())
1929 }
1930
1931 #[test]
1932 fn test_unicode_mixed_headings_edge_cases() -> Result<()> {
1933 let markdown = r"# 🔥 Main Section
1935Content with emoji
1936
1937## Ünïcödë Heading
1938Спецйальные символы
1939
1940### Deep → Nested ← Section
1941More content here
1942
1943#### Even Deeper
1944Nested content
1945
1946##### Fifth Level
1947Very deep
1948
1949###### Sixth Level
1950Deepest level
1951
1952### Back to Level 3
1953After deep nesting";
1954
1955 let mut parser = create_test_parser();
1956 let result = parser.parse(markdown)?;
1957
1958 assert!(
1960 result.heading_blocks.len() >= 7,
1961 "Should extract all heading levels"
1962 );
1963
1964 assert!(result.heading_blocks[0].path[0].contains("🔥"));
1966 assert!(result.heading_blocks[1].path[1].contains("Ünïcödë"));
1967
1968 let deep_block = result
1970 .heading_blocks
1971 .iter()
1972 .find(|b| b.path.last().is_some_and(|p| p.contains("Fifth Level")))
1973 .expect("Should find Fifth Level heading");
1974 assert!(
1975 deep_block.path.len() >= 5,
1976 "Fifth level should be deeply nested"
1977 );
1978
1979 let back_block = result
1981 .heading_blocks
1982 .iter()
1983 .find(|b| b.path.last().is_some_and(|p| p.contains("Back to Level 3")))
1984 .expect("Should find Back to Level 3 heading");
1985 assert_eq!(
1986 back_block.path.len(),
1987 3,
1988 "Should be at level 3 after backtracking"
1989 );
1990
1991 Ok(())
1992 }
1993}