use crate::{
Diagnostic, DiagnosticSeverity, Error, HeadingBlock, Result, TocEntry, heading::path_variants,
};
use base64::{Engine, engine::general_purpose::STANDARD as B64};
use sha2::{Digest, Sha256};
/// Largest number of lines placed in one synthetic `["Document"]` block when a
/// document has no headings; longer documents are split into windows of this size.
const FALLBACK_WINDOW_LINES: usize = 200;
use std::collections::VecDeque;
use tree_sitter::{Node, Parser, TreeCursor};
/// Markdown parser that splits a document into heading blocks and a table of contents.
pub struct MarkdownParser {
// tree-sitter parser; `new` installs the Markdown grammar on it.
parser: Parser,
}
impl MarkdownParser {
pub fn new() -> Result<Self> {
let mut parser = Parser::new();
parser
.set_language(&tree_sitter_md::LANGUAGE.into())
.map_err(|e| Error::Parse(format!("Failed to set language: {e}")))?;
Ok(Self { parser })
}
pub fn parse(&mut self, text: &str) -> Result<ParseResult> {
let tree = self
.parser
.parse(text, None)
.ok_or_else(|| Error::Parse("Failed to parse markdown".into()))?;
let root = tree.root_node();
let mut diagnostics = Vec::new();
let mut heading_blocks = Vec::new();
let mut toc = Vec::new();
if root.has_error() {
diagnostics.push(Diagnostic {
severity: DiagnosticSeverity::Warn,
message: "Parse tree contains errors, using fallback parsing".into(),
line: None,
});
}
let mut cursor = root.walk();
Self::extract_headings(&mut cursor, text, &mut heading_blocks, &mut toc);
if heading_blocks.is_empty() {
diagnostics.push(Diagnostic {
severity: DiagnosticSeverity::Warn,
message: "No headings found in document".into(),
line: Some(1),
});
let total_lines = text.lines().count();
if total_lines <= FALLBACK_WINDOW_LINES {
let path = vec!["Document".into()];
let variants = path_variants(&path);
heading_blocks.push(HeadingBlock {
path,
display_path: variants.display_segments,
normalized_tokens: variants.tokens,
content: text.to_string(),
start_line: 1,
end_line: total_lines,
});
} else {
let mut start = 1usize;
let mut current = String::new();
let mut count = 0usize;
for line in text.lines() {
if count > 0 {
current.push('\n');
}
current.push_str(line);
count += 1;
if count == FALLBACK_WINDOW_LINES {
let end_line = start + count - 1;
let path = vec!["Document".into()];
let variants = path_variants(&path);
heading_blocks.push(HeadingBlock {
path,
display_path: variants.display_segments,
normalized_tokens: variants.tokens,
content: std::mem::take(&mut current),
start_line: start,
end_line,
});
start = end_line + 1;
count = 0;
}
}
if !current.is_empty() {
let end_line = start + count - 1;
let path = vec!["Document".into()];
let variants = path_variants(&path);
heading_blocks.push(HeadingBlock {
path,
display_path: variants.display_segments,
normalized_tokens: variants.tokens,
content: current,
start_line: start,
end_line,
});
}
}
}
let line_count = text.lines().count();
Ok(ParseResult {
heading_blocks,
toc,
diagnostics,
line_count,
})
}
/// Collects every `atx_heading` node reachable from `cursor` and appends one
/// [`HeadingBlock`] to `blocks` and one [`TocEntry`] to `toc` per retained heading.
///
/// * A block's content runs from its heading's first byte up to the first byte
///   of the next heading (or the end of `text` for the last heading).
/// * Level-1 headings whose trimmed text starts with `404` are skipped; they
///   also reset the current heading path.
/// * Nesting depth is measured relative to the shallowest heading level seen so
///   far, so a document that starts at `##` still produces top-level entries.
///
/// Leaves `blocks` and `toc` untouched when the tree contains no headings.
fn extract_headings(
    cursor: &mut TreeCursor,
    text: &str,
    blocks: &mut Vec<HeadingBlock>,
    toc: &mut Vec<TocEntry>,
) {
    #[derive(Debug)]
    struct HeadingInfo {
        level: usize,
        text: String,
        byte_start: usize,
        line_start: usize,
    }

    let mut headings = Vec::new();
    Self::walk_tree(cursor, text, |node| {
        if node.kind() == "atx_heading" {
            headings.push(HeadingInfo {
                level: Self::get_heading_level(node, text),
                text: Self::get_heading_text(node, text),
                byte_start: node.byte_range().start,
                // tree-sitter rows are zero-based.
                line_start: node.start_position().row,
            });
        }
    });
    if headings.is_empty() {
        return;
    }
    headings.sort_by_key(|h| h.byte_start);

    let mut current_path: Vec<String> = Vec::new();
    // One effective level per segment of `current_path`.
    let mut stack: VecDeque<usize> = VecDeque::new();
    let mut baseline_level: Option<usize> = None;

    for (i, heading) in headings.iter().enumerate() {
        if heading.level == 1 && heading.text.trim().starts_with("404") {
            current_path.clear();
            stack.clear();
            continue;
        }

        if baseline_level.is_none_or(|level| heading.level < level) {
            baseline_level = Some(heading.level);
        }
        let baseline = baseline_level.unwrap_or(1);
        let effective_level = heading
            .level
            .saturating_sub(baseline.saturating_sub(1))
            .max(1);

        // Drop path segments that are at the same depth or deeper than this heading.
        while stack.len() >= effective_level {
            stack.pop_back();
            current_path.pop();
        }
        current_path.push(heading.text.clone());
        stack.push_back(effective_level);

        // The next heading's zero-based row equals the one-based number of the
        // line just before it, which is this block's last line.
        let (content_end, end_line) = match headings.get(i + 1) {
            Some(following) => (following.byte_start, following.line_start),
            None => (text.len(), text.lines().count()),
        };
        let content = &text[heading.byte_start..content_end];
        let start_line = heading.line_start + 1;

        let variants = path_variants(&current_path);
        blocks.push(HeadingBlock {
            path: current_path.clone(),
            display_path: variants.display_segments.clone(),
            normalized_tokens: variants.tokens,
            content: content.to_string(),
            start_line,
            end_line,
        });

        let anchor = Some(Self::compute_anchor(&current_path, &heading.text, content));
        let lines = if end_line > start_line {
            format!("{start_line}-{end_line}")
        } else {
            format!("{start_line}")
        };
        let entry = TocEntry {
            heading_path: current_path.clone(),
            heading_path_display: Some(variants.display_segments),
            heading_path_normalized: Some(variants.normalized_segments),
            lines,
            anchor,
            children: Vec::new(),
        };
        Self::add_to_toc(toc, entry, stack.len());
    }
}
/// Derives a heading's anchor from its text alone.
///
/// The text is trimmed and lowercased, hashed with SHA-256, base64-encoded, and
/// cut to at most 22 characters. `_path` and `_content` are not used.
fn compute_anchor(_path: &[String], heading_text: &str, _content: &str) -> String {
    let normalized = heading_text.trim().to_lowercase();
    let mut encoded = B64.encode(Sha256::digest(normalized.as_bytes()));
    encoded.truncate(22);
    encoded
}
/// Visits every node reachable from `cursor` in pre-order, calling `callback`
/// on each one. `_text` is unused.
fn walk_tree<F>(cursor: &mut TreeCursor, _text: &str, mut callback: F)
where
    F: FnMut(Node),
{
    'visit: loop {
        callback(cursor.node());

        // Descend first, then move sideways.
        if cursor.goto_first_child() || cursor.goto_next_sibling() {
            continue 'visit;
        }

        // Neither a child nor a sibling: climb until an ancestor has a sibling
        // that has not been visited yet.
        while cursor.goto_parent() {
            if cursor.goto_next_sibling() {
                continue 'visit;
            }
        }

        // Climbed past the starting node; the traversal is complete.
        return;
    }
}
/// Returns the level (1-6) of an ATX heading from its first `atx_hN_marker`
/// child, defaulting to 1 when no marker child is present. `_text` is unused.
fn get_heading_level(node: Node, _text: &str) -> usize {
    let mut walker = node.walk();
    let marker_level = node
        .children(&mut walker)
        .find_map(|child| match child.kind() {
            "atx_h1_marker" => Some(1),
            "atx_h2_marker" => Some(2),
            "atx_h3_marker" => Some(3),
            "atx_h4_marker" => Some(4),
            "atx_h5_marker" => Some(5),
            "atx_h6_marker" => Some(6),
            _ => None,
        });
    marker_level.unwrap_or(1)
}
/// Extracts the visible text of a heading node.
///
/// Uses the first child whose kind contains both `heading` and `content`;
/// when there is none, falls back to the node's own text with leading `#`
/// characters and surrounding whitespace removed.
fn get_heading_text(node: Node, text: &str) -> String {
    let mut walker = node.walk();
    let content_child = node.children(&mut walker).find(|child| {
        let kind = child.kind();
        kind.contains("heading") && kind.contains("content")
    });

    match content_child {
        Some(child) => text[child.byte_range()].trim().to_string(),
        None => text[node.byte_range()]
            .trim_start_matches('#')
            .trim()
            .to_string(),
    }
}
/// Inserts `entry` into `toc` at nesting level `depth` (1 = top level).
///
/// Deeper entries are attached beneath the most recent entry of each level;
/// the entry is dropped when the required parent does not exist.
fn add_to_toc(toc: &mut Vec<TocEntry>, entry: TocEntry, depth: usize) {
    // The placement rule is exactly the one the recursive helper implements.
    Self::add_to_toc_recursive(toc, entry, depth);
}
fn add_to_toc_recursive(toc: &mut Vec<TocEntry>, entry: TocEntry, depth: usize) {
if depth == 1 {
toc.push(entry);
} else if let Some(parent) = toc.last_mut() {
Self::add_to_toc_recursive(&mut parent.children, entry, depth - 1);
}
}
}
/// Everything produced by one `MarkdownParser::parse` call.
#[derive(Clone)]
pub struct ParseResult {
// Heading-delimited blocks, or synthetic `["Document"]` blocks when no heading was found.
pub heading_blocks: Vec<HeadingBlock>,
// Nested table of contents built from the retained headings.
pub toc: Vec<TocEntry>,
// Warnings recorded while parsing.
pub diagnostics: Vec<Diagnostic>,
// Number of lines in the input, as counted by `str::lines`.
pub line_count: usize,
}
#[cfg(test)]
#[allow(
clippy::unwrap_used,
clippy::unnecessary_wraps,
clippy::format_push_string,
clippy::disallowed_macros
)]
mod tests {
use super::*;
use proptest::prelude::*;
// Builds a parser for a test, panicking if construction fails.
fn create_test_parser() -> MarkdownParser {
MarkdownParser::new().expect("Failed to create parser")
}
// Moving a section must change its line range but not its anchor.
#[test]
fn test_anchor_stability_when_section_moves() {
    // Depth-first search for the TOC entry whose last path segment is `name`.
    fn find<'a>(entries: &'a [TocEntry], name: &str) -> Option<&'a TocEntry> {
        entries.iter().find_map(|entry| {
            if entry.heading_path.last().is_some_and(|h| h == name) {
                Some(entry)
            } else {
                find(&entry.children, name)
            }
        })
    }

    let mut parser = create_test_parser();

    let original = "# Intro\n\nPrelude.\n\n## Section A\n\nAlpha content line 1.\nAlpha content line 2.\n\n## Section B\n\nBeta content.\n";
    let parsed_original = parser.parse(original).expect("parse v1");
    let entry_original = find(&parsed_original.toc, "Section A").expect("section A in v1");
    let anchor_original = entry_original.anchor.clone().expect("anchor v1");
    let lines_original = entry_original.lines.clone();

    let reordered = "# Intro\n\nPrelude.\n\n## Section B\n\nBeta content.\n\n## Section A\n\nAlpha content line 1.\nAlpha content line 2.\n";
    let parsed_reordered = parser.parse(reordered).expect("parse v2");
    let entry_reordered = find(&parsed_reordered.toc, "Section A").expect("section A in v2");
    let anchor_reordered = entry_reordered.anchor.clone().expect("anchor v2");
    let lines_reordered = entry_reordered.lines.clone();

    assert_eq!(
        anchor_original, anchor_reordered,
        "anchor stable across moves"
    );
    assert_ne!(
        lines_original, lines_reordered,
        "lines should reflect new position"
    );
}
// A top-level "404" heading is dropped while the sections below it are kept.
#[test]
fn test_skips_placeholder_404_headings() -> Result<()> {
    let doc = r"# 404
Check the URL.
## Actual Section
Real content lives here.
### Nested Detail
Additional context.
## Follow Up
More guidance.
";
    let parsed = create_test_parser().parse(doc)?;

    assert_eq!(
        parsed.toc.len(),
        2,
        "top-level entries should ignore 404 headings"
    );

    let mentions_placeholder = parsed
        .toc
        .iter()
        .flat_map(|entry| entry.heading_path.iter())
        .any(|component| component.starts_with("404"));
    assert!(
        !mentions_placeholder,
        "toc should not contain placeholder 404 entries"
    );

    assert_eq!(
        parsed.heading_blocks.len(),
        3,
        "children under 404 should remain accessible"
    );
    assert_eq!(parsed.heading_blocks[0].path[0], "Actual Section");
    Ok(())
}
// Fixture: a small document with headings at levels 1 to 3, each followed by one line of text.
fn simple_markdown() -> &'static str {
r"# Main Heading
This is some content under the main heading.
## Sub Heading
More content here.
### Deep Heading
Even deeper content.
## Another Sub
Final content.
"
}
// Fixture: a documentation-style page with headings down to level 4, fenced code
// blocks, and bullet and numbered lists.
fn complex_markdown() -> &'static str {
r#"# Getting Started
Welcome to our documentation!
## Installation
Run the following command:
```bash
npm install
```
### Requirements
- Node.js 16+
- npm 7+
## Usage
Here's how to use it:
1. First step
2. Second step
### Advanced Usage
For advanced users:
#### Configuration
Edit the config file:
```json
{
"key": "value"
}
```
## Troubleshooting
Common issues:
- Issue 1
- Issue 2
"#
}
// Fixture: headings followed by sloppy Markdown, including an unclosed code span
// and a code fence that is never closed.
fn malformed_markdown() -> &'static str {
r"# Broken Heading
## Missing content
### Unmatched brackets ][
Content with `unclosed code
> Broken quote
>> Nested broken quote
* List item
* Nested without proper spacing
* Another item
```
Unclosed code block
"
}
// Constructing a parser succeeds.
#[test]
fn test_parser_creation() {
    assert!(MarkdownParser::new().is_ok());
}
// The simple fixture yields blocks, a TOC, the right line count, and both named headings.
#[test]
fn test_parse_simple_markdown() -> Result<()> {
    let source = simple_markdown();
    let parsed = create_test_parser().parse(source)?;

    assert!(!parsed.heading_blocks.is_empty());
    assert!(!parsed.toc.is_empty());
    assert_eq!(parsed.line_count, source.lines().count());

    let has_heading = |name: &str| {
        parsed
            .heading_blocks
            .iter()
            .any(|block| block.path.iter().any(|segment| segment == name))
    };
    assert!(has_heading("Main Heading"));
    assert!(has_heading("Sub Heading"));
    Ok(())
}
// The complex fixture exposes its headings and starts the TOC at "Getting Started".
#[test]
fn test_parse_complex_markdown_structure() -> Result<()> {
    let parsed = create_test_parser().parse(complex_markdown())?;
    assert!(parsed.heading_blocks.len() >= 5);

    let segments: Vec<&String> = parsed
        .heading_blocks
        .iter()
        .flat_map(|block| block.path.iter())
        .collect();
    for expected in [
        "Getting Started",
        "Installation",
        "Requirements",
        "Configuration",
    ] {
        assert!(segments.iter().any(|segment| segment.contains(expected)));
    }

    assert!(!parsed.toc.is_empty());
    let first_entry = &parsed.toc[0];
    assert!(
        first_entry
            .heading_path
            .iter()
            .any(|segment| segment == "Getting Started")
    );
    Ok(())
}
// Malformed input still parses and produces at least one block.
#[test]
fn test_parse_malformed_markdown() -> Result<()> {
    let parsed = create_test_parser().parse(malformed_markdown())?;
    assert!(!parsed.heading_blocks.is_empty());
    Ok(())
}
// An empty document has zero lines, at most one block, and a warning diagnostic.
#[test]
fn test_parse_empty_document() -> Result<()> {
    let parsed = create_test_parser().parse("")?;

    assert_eq!(parsed.line_count, 0);
    assert!(parsed.heading_blocks.len() <= 1);

    let warned = parsed.diagnostics.iter().any(|d| {
        d.severity == DiagnosticSeverity::Warn || d.message.contains("No headings found")
    });
    assert!(warned);
    Ok(())
}
// Plain text without headings becomes a single "Document" block plus a diagnostic.
#[test]
fn test_parse_document_without_headings() -> Result<()> {
    let plain_text = r"This is just plain text.
With multiple paragraphs.
And some more content.
But no headings at all.
";
    let parsed = create_test_parser().parse(plain_text)?;

    assert_eq!(parsed.heading_blocks.len(), 1);
    let only_block = &parsed.heading_blocks[0];
    assert_eq!(only_block.path, vec!["Document".to_string()]);
    assert_eq!(only_block.content.trim(), plain_text.trim());

    let reported = parsed
        .diagnostics
        .iter()
        .any(|d| d.message.contains("No headings found"));
    assert!(reported);
    Ok(())
}
// A long heading-less document is split into fixed-size "Document" windows.
#[test]
fn test_windowed_segmentation_for_large_unstructured() -> Result<()> {
    // Two full windows plus a partial third one.
    let total = FALLBACK_WINDOW_LINES * 2 + 25;
    let numbered_lines: Vec<String> = (1..=total).map(|i| format!("line {i}")).collect();
    let doc = numbered_lines.join("\n");

    let parsed = create_test_parser().parse(&doc)?;

    assert_eq!(parsed.heading_blocks.len(), 3);
    for block in &parsed.heading_blocks {
        assert_eq!(block.path, vec!["Document".to_string()]);
        assert!(block.start_line >= 1);
        assert!(block.end_line <= total);
    }
    assert_eq!(
        parsed.heading_blocks.last().map(|block| block.end_line),
        Some(total)
    );
    Ok(())
}
// Six nested heading levels produce paths of depth 1, 2, and at least 3.
#[test]
fn test_heading_level_detection() -> Result<()> {
    let multilevel = r"# Level 1
## Level 2
### Level 3
#### Level 4
##### Level 5
###### Level 6
";
    let parsed = create_test_parser().parse(multilevel)?;
    assert!(parsed.heading_blocks.len() >= 6);

    let depths: Vec<usize> = parsed
        .heading_blocks
        .iter()
        .map(|block| block.path.len())
        .collect();
    assert!(depths.contains(&1));
    assert!(depths.contains(&2));
    assert!(depths.iter().any(|&depth| depth >= 3));
    Ok(())
}
// Inline formatting inside a heading does not hide the heading's words.
#[test]
fn test_heading_text_extraction() -> Result<()> {
    let formatted_headings = r"# **Bold Heading**
## _Italic Heading_
### `Code in Heading`
#### Heading with [Link](http://example.com)
##### Heading with **bold** and _italic_
";
    let parsed = create_test_parser().parse(formatted_headings)?;

    let segments: Vec<&String> = parsed
        .heading_blocks
        .iter()
        .flat_map(|block| block.path.iter())
        .collect();
    for expected in ["Bold Heading", "Italic Heading", "Code in Heading"] {
        assert!(segments.iter().any(|segment| segment.contains(expected)));
    }
    Ok(())
}
// Each section's block carries the text written beneath its heading.
#[test]
fn test_content_extraction() -> Result<()> {
    let source = r"# Section A
This is content for section A.
It spans multiple lines.
## Subsection A1
More specific content here.
# Section B
Different content for section B.
";
    let parsed = create_test_parser().parse(source)?;
    let blocks = &parsed.heading_blocks;

    let section_a = blocks
        .iter()
        .find(|block| block.path.iter().any(|segment| segment == "Section A"))
        .expect("Section A should be found");
    assert!(section_a.content.contains("This is content for section A"));
    assert!(section_a.content.contains("multiple lines"));

    let section_b = blocks
        .iter()
        .find(|block| block.path.iter().any(|segment| segment == "Section B"))
        .expect("Section B should be found");
    assert!(
        section_b
            .content
            .contains("Different content for section B")
    );
    Ok(())
}
// Line ranges stay within the document and are never inverted.
#[test]
fn test_line_number_tracking() -> Result<()> {
    let source = "Line 1\n# Heading at line 2\nLine 3\nLine 4\n## Sub at line 5\nLine 6";
    let parsed = create_test_parser().parse(source)?;
    assert_eq!(parsed.line_count, 6);

    let tracked = parsed.heading_blocks.iter().find(|block| {
        block
            .path
            .iter()
            .any(|segment| segment == "Heading at line 2")
    });
    if let Some(block) = tracked {
        assert!(block.start_line >= 1);
        assert!(block.end_line <= parsed.line_count);
        assert!(block.start_line <= block.end_line);
    }
    Ok(())
}
// The TOC starts at the first top-level heading and nests its children beneath it.
#[test]
fn test_toc_generation() -> Result<()> {
    let hierarchical = r"# Top Level
## First Sub
### Deep Sub 1
### Deep Sub 2
## Second Sub
### Another Deep
#### Very Deep
# Another Top
";
    let parsed = create_test_parser().parse(hierarchical)?;
    assert!(!parsed.toc.is_empty());

    let first_top = &parsed.toc[0];
    assert!(
        first_top
            .heading_path
            .iter()
            .any(|segment| segment == "Top Level")
    );
    if let Some(first_sub) = first_top.children.first() {
        assert!(first_sub.heading_path.len() >= 2);
    }
    Ok(())
}
proptest! {
#[test]
fn test_parser_never_panics_on_arbitrary_input(
content in prop::string::string_regex("[\\x20-\\x7E\\n\\r\\t]{0,500}").unwrap()
) {
let mut parser = create_test_parser();
let result = parser.parse(&content);
if let Ok(parse_result) = result {
prop_assert!(parse_result.line_count == content.lines().count());
prop_assert!(!parse_result.heading_blocks.is_empty()); } else {
}
}
#[test]
fn test_line_count_accuracy(
lines in prop::collection::vec(
prop::string::string_regex("[\\x20-\\x7E]{0,100}").unwrap(),
0..50
)
) {
let content = lines.join("\n");
let mut parser = create_test_parser();
let expected_lines = if content.is_empty() {
0
} else {
content.lines().count()
};
if let Ok(result) = parser.parse(&content) {
prop_assert_eq!(result.line_count, expected_lines);
}
}
#[test]
fn test_single_heading_parsing(heading_text in r"[a-zA-Z][a-zA-Z0-9 ]{2,30}") {
let mut parser = create_test_parser();
let markdown = format!("# {heading_text}");
let trimmed = heading_text.trim();
if trimmed.is_empty() || trimmed.len() < 2 {
return Ok(());
}
if let Ok(result) = parser.parse(&markdown) {
prop_assert!(!result.heading_blocks.is_empty());
if !result.toc.is_empty() {
let has_heading = result.heading_blocks.iter()
.any(|block| block.path.iter().any(|p| p.contains(trimmed)));
prop_assert!(has_heading);
}
}
}
#[test]
fn test_heading_level_detection_consistency(
levels in prop::collection::vec(1u8..=6, 1..10)
) {
let mut parser = create_test_parser();
let mut markdown = String::new();
let mut expected_path_lens = Vec::new();
for (i, level) in levels.iter().enumerate() {
let heading_text = format!("Heading {}", i + 1);
let heading_line = format!("{} {}\n\nContent for heading {}\n\n",
"#".repeat(*level as usize),
heading_text,
i + 1);
markdown.push_str(&heading_line);
expected_path_lens.push(*level as usize);
}
if let Ok(result) = parser.parse(&markdown) {
prop_assert!(result.heading_blocks.len() >= levels.len().min(1));
for (i, expected_depth) in expected_path_lens.iter().enumerate() {
if i < result.heading_blocks.len() {
let actual_depth = result.heading_blocks[i].path.len();
prop_assert!(actual_depth <= *expected_depth);
prop_assert!(actual_depth >= 1);
}
}
}
}
#[test]
fn test_unicode_content_preservation(
content in r"[\u{0080}-\u{FFFF}]{1,100}"
) {
let mut parser = create_test_parser();
let markdown = format!("# Unicode Test\n\n{content}");
if let Ok(result) = parser.parse(&markdown) {
let has_unicode = result.heading_blocks.iter()
.any(|block| block.content.contains(&content));
prop_assert!(has_unicode, "Unicode content should be preserved");
prop_assert_eq!(result.line_count, markdown.lines().count());
}
}
#[test]
fn test_mixed_line_endings(
line_ending in prop_oneof![Just("\n"), Just("\r\n"), Just("\r")]
) {
let mut parser = create_test_parser();
let content_lines = ["# Main Heading",
"",
"This is content.",
"",
"## Sub Heading",
"",
"More content here."];
let markdown = content_lines.join(line_ending);
if let Ok(result) = parser.parse(&markdown) {
prop_assert!(!result.heading_blocks.is_empty());
let main_heading = result.heading_blocks.iter()
.any(|block| block.path.iter().any(|p| p.contains("Main Heading")));
let sub_heading = result.heading_blocks.iter()
.any(|block| block.path.iter().any(|p| p.contains("Sub Heading")));
prop_assert!(main_heading || sub_heading, "Should find at least one heading");
}
}
#[test]
fn test_deeply_nested_structure(depth in 1usize..20) {
let mut parser = create_test_parser();
let mut markdown = String::new();
for level in 1..=depth.min(6) {
let heading = format!("{} Level {} Heading\n\nContent at level {}.\n\n",
"#".repeat(level), level, level);
markdown.push_str(&heading);
}
if let Ok(result) = parser.parse(&markdown) {
prop_assert!(!result.heading_blocks.is_empty());
prop_assert!(!result.toc.is_empty());
if let Some(deepest) = result.heading_blocks.iter()
.max_by_key(|block| block.path.len()) {
prop_assert!(deepest.path.len() <= depth.min(6));
}
}
}
#[test]
fn test_large_content_blocks(
block_size in 100usize..5000,
num_blocks in 1usize..10
) {
let mut parser = create_test_parser();
let mut markdown = String::new();
for i in 0..num_blocks {
markdown.push_str(&format!("# Heading {}\n\n", i + 1));
let content_line = format!("This is line {i} of content. ");
let large_content = content_line.repeat(block_size / content_line.len());
markdown.push_str(&large_content);
markdown.push_str("\n\n");
}
if let Ok(result) = parser.parse(&markdown) {
prop_assert_eq!(result.heading_blocks.len(), num_blocks);
for block in &result.heading_blocks {
prop_assert!(block.content.len() > block_size / 2);
}
prop_assert!(result.line_count >= num_blocks * 3); }
}
#[test]
fn test_markdown_syntax_edge_cases(
syntax_char in prop_oneof![
Just("*"), Just("_"), Just("`"), Just("~"),
Just("["), Just("]"), Just("("), Just(")"),
Just("!"), Just("#"), Just(">"), Just("-"),
Just("+"), Just("="), Just("|"), Just("\\")
]
) {
let mut parser = create_test_parser();
let markdown = format!(
"# Test Heading\n\nContent with {syntax_char} special {syntax_char} characters {syntax_char} here.\n\n## Another {syntax_char}\n\nMore {syntax_char} content."
);
if let Ok(result) = parser.parse(&markdown) {
prop_assert!(!result.heading_blocks.is_empty());
let has_special_chars = result.heading_blocks.iter()
.any(|block| block.content.contains(syntax_char));
prop_assert!(has_special_chars, "Special characters should be preserved");
}
}
#[test]
fn test_heading_with_formatting(
format_type in prop_oneof"),
Just("~~strike~~")
],
heading_text in r"[a-zA-Z ]{5,20}"
) {
let mut parser = create_test_parser();
let formatted_heading = format!("# {heading_text} {format_type}\n\nContent here.");
if let Ok(result) = parser.parse(&formatted_heading) {
prop_assert!(!result.heading_blocks.is_empty());
let heading_found = result.heading_blocks.iter()
.any(|block| block.path.iter()
.any(|p| p.contains(heading_text.trim())));
prop_assert!(heading_found, "Should find heading text");
}
}
#[test]
fn test_random_whitespace_patterns(
spaces_before in 0usize..4, spaces_after in 0usize..10,
tabs_mixed in 0usize..5
) {
let mut parser = create_test_parser();
let whitespace_prefix = " ".repeat(spaces_before); let whitespace_suffix = format!("{}{}",
" ".repeat(spaces_after),
"\t".repeat(tabs_mixed));
let markdown = format!("{whitespace_prefix}# Test Heading{whitespace_suffix}\n\nContent here.");
if let Ok(result) = parser.parse(&markdown) {
prop_assert!(!result.heading_blocks.is_empty());
let found_heading = result.heading_blocks.iter()
.any(|block| block.path.iter()
.any(|p| p.contains("Test Heading")));
prop_assert!(found_heading, "Should find heading with {} spaces before", spaces_before);
}
}
#[test]
fn test_content_with_code_blocks(
language in prop_oneof![
Just("rust"), Just("javascript"), Just("python"),
Just("bash"), Just("json"), Just("")
],
code_lines in prop::collection::vec(r"[a-zA-Z0-9 ]{0,50}", 1..10)
) {
let mut parser = create_test_parser();
let code_content = code_lines.join("\n");
let markdown = format!(
"# Code Example\n\nHere's some code:\n\n```{language}\n{code_content}\n```\n\n## After Code\n\nMore content."
);
if let Ok(result) = parser.parse(&markdown) {
prop_assert!(!result.heading_blocks.is_empty());
let has_code = result.heading_blocks.iter()
.any(|block| block.content.contains(&code_content));
prop_assert!(has_code, "Code content should be preserved");
let headings: Vec<_> = result.heading_blocks.iter()
.flat_map(|block| &block.path)
.collect();
let has_main = headings.iter().any(|h| h.contains("Code Example"));
let has_after = headings.iter().any(|h| h.contains("After Code"));
prop_assert!(has_main || has_after, "Should find at least one heading");
}
}
}
// Hostile inputs (huge headings, control characters, absurd nesting, mixed line
// endings) either fail to parse or parse into sane results.
#[test]
fn test_parser_handles_malicious_markdown() -> Result<()> {
    let cycling_levels = (1..=100)
        .map(|i| format!("{} Level {}", "#".repeat(i % 6 + 1), i))
        .collect::<Vec<_>>()
        .join("\n");
    let over_deep_levels = (2..=50)
        .map(|i| format!("{} Level {}", "#".repeat(i), i))
        .collect::<Vec<_>>()
        .join("\n");

    let malicious_inputs = vec![
        format!("# {}", "A".repeat(10000)),
        cycling_levels,
        "# \u{202e}reversed\u{202d} heading".to_string(),
        "# Heading with \x00 null \x01 characters".to_string(),
        format!("# Top\n{}", over_deep_levels),
        "# Heading 1\r\n## Heading 2\n### Heading 3\r#### Heading 4".to_string(),
    ];

    let mut parser = create_test_parser();
    for input in &malicious_inputs {
        // A parse error is an acceptable outcome; only successes are checked.
        if let Ok(parsed) = parser.parse(input) {
            assert!(parsed.line_count <= input.lines().count() + 1);
            assert!(!parsed.heading_blocks.is_empty());
        }
    }
    Ok(())
}
// Headings in several scripts, and with emoji, are extracted intact.
#[test]
fn test_parser_handles_unicode_content() -> Result<()> {
    let unicode_markdown = r"# 日本語のヘッダー
これは日本語のコンテンツです。
## العنوان العربي
محتوى باللغة العربية.
### Заголовок на русском
Русский контент.
#### 🚀 Emoji Header 🎉
Content with emojis: 😀 🎈 🌟
##### Mixed: English 中文 العربية русский
";
    let parsed = create_test_parser().parse(unicode_markdown)?;
    assert!(!parsed.heading_blocks.is_empty());
    assert!(!parsed.toc.is_empty());

    let segments: Vec<&String> = parsed
        .heading_blocks
        .iter()
        .flat_map(|block| block.path.iter())
        .collect();
    for expected in ["日本語", "العربي", "русском", "🚀"] {
        assert!(segments.iter().any(|segment| segment.contains(expected)));
    }
    Ok(())
}
// A two-thousand-line document parses with the right line count and a "Main" block.
#[test]
fn test_parser_memory_efficiency() -> Result<()> {
    let main_body = "Content line.\n".repeat(1000);
    let sub_body = "More content.\n".repeat(1000);
    let large_doc = format!("# Main\n\n{}\n\n## Sub\n\n{}", main_body, sub_body);

    let parsed = create_test_parser().parse(&large_doc)?;

    assert!(!parsed.heading_blocks.is_empty());
    assert_eq!(parsed.line_count, large_doc.lines().count());
    let has_main = parsed
        .heading_blocks
        .iter()
        .any(|block| block.path.iter().any(|segment| segment == "Main"));
    assert!(has_main);
    Ok(())
}
// Odd but valid inputs either parse consistently or fail with a parse error.
#[test]
fn test_parser_edge_cases() -> Result<()> {
    let edge_cases = [
        " \n\t\n ",
        "# A\n## B\n### C\n#### D",
        "# !!!\n## ???\n### ***",
        "#\n##\n###",
        "# Heading \n## Another ",
        "# ATX Style\nSetext Style\n============",
    ];

    let mut parser = create_test_parser();
    for edge_case in edge_cases {
        match parser.parse(edge_case) {
            Ok(parsed) => {
                assert!(parsed.line_count == edge_case.lines().count());
                assert!(!parsed.heading_blocks.is_empty());
            }
            Err(e) => {
                let message = e.to_string();
                assert!(message.contains("parse") || message.contains("Parse"));
            }
        }
    }
    Ok(())
}
// A heading-less document reports a warning that mentions the missing headings.
#[test]
fn test_diagnostic_generation() -> Result<()> {
    let headingless = r"Some content without headings
More content here
And even more content
";
    let parsed = create_test_parser().parse(headingless)?;
    assert!(!parsed.diagnostics.is_empty());

    let warned = parsed.diagnostics.iter().any(|d| {
        matches!(d.severity, DiagnosticSeverity::Warn) && d.message.contains("No headings")
    });
    assert!(warned);
    Ok(())
}
// Parsing the same text twice with one parser gives identical structure.
#[test]
fn test_parser_consistency() -> Result<()> {
    let source = simple_markdown();
    let mut parser = create_test_parser();
    let first_run = parser.parse(source)?;
    let second_run = parser.parse(source)?;

    assert_eq!(
        first_run.heading_blocks.len(),
        second_run.heading_blocks.len()
    );
    assert_eq!(first_run.toc.len(), second_run.toc.len());
    assert_eq!(first_run.line_count, second_run.line_count);

    let paired_blocks = first_run
        .heading_blocks
        .iter()
        .zip(second_run.heading_blocks.iter());
    for (left, right) in paired_blocks {
        assert_eq!(left.path, right.path);
        assert_eq!(left.start_line, right.start_line);
        assert_eq!(left.end_line, right.end_line);
    }
    Ok(())
}
// Each block holds only the text under its own heading, never a neighbour's.
#[test]
#[allow(clippy::similar_names)]
fn test_heading_blocks_no_duplication() -> Result<()> {
    // Asserts that `block` contains every `present` marker and no `absent` one.
    fn assert_sentinels(block: &HeadingBlock, present: &[&str], absent: &[&str]) {
        for &marker in present {
            assert!(block.content.contains(marker));
        }
        for &marker in absent {
            assert!(!block.content.contains(marker));
        }
    }

    let markdown = r"# First Heading
SENTINEL_FIRST_START
Content under first heading
with multiple lines
SENTINEL_FIRST_END
## First Sub
SENTINEL_SUB_START
Content under first sub
SENTINEL_SUB_END
## Second Sub
SENTINEL_SUB2_START
Content under second sub
SENTINEL_SUB2_END
# Second Heading
SENTINEL_SECOND_START
Final content
SENTINEL_SECOND_END";
    let parsed = create_test_parser().parse(markdown)?;
    let blocks = &parsed.heading_blocks;
    assert_eq!(blocks.len(), 4, "Should have 4 heading blocks");

    let duplication_checks = [
        ("SENTINEL_FIRST_START", "First sentinel duplicated"),
        ("SENTINEL_SUB_START", "Sub sentinel duplicated"),
        ("SENTINEL_SUB2_START", "Sub2 sentinel duplicated"),
        ("SENTINEL_SECOND_START", "Second sentinel duplicated"),
    ];
    for block in blocks {
        for (sentinel, message) in duplication_checks {
            assert!(block.content.matches(sentinel).count() <= 1, "{message}");
        }
    }

    assert_sentinels(
        &blocks[0],
        &["SENTINEL_FIRST_START", "SENTINEL_FIRST_END"],
        &["SENTINEL_SUB_START"],
    );
    assert_sentinels(
        &blocks[1],
        &["SENTINEL_SUB_START", "SENTINEL_SUB_END"],
        &["SENTINEL_FIRST", "SENTINEL_SUB2"],
    );
    assert_sentinels(
        &blocks[2],
        &["SENTINEL_SUB2_START", "SENTINEL_SUB2_END"],
        &["SENTINEL_SUB_START", "SENTINEL_SECOND"],
    );
    assert_sentinels(
        &blocks[3],
        &["SENTINEL_SECOND_START", "SENTINEL_SECOND_END"],
        &["SENTINEL_FIRST", "SENTINEL_SUB"],
    );
    Ok(())
}
// Blocks report exact one-based, inclusive line ranges.
#[test]
fn test_line_ranges_accuracy() -> Result<()> {
    let markdown = "# Heading at Line 1\nLine 2\nLine 3\nLine 4\nLine 5\n## Sub at Line 6\nLine 7\nLine 8\n# Another at Line 9\nLine 10";
    let parsed = create_test_parser().parse(markdown)?;

    assert_eq!(parsed.line_count, 10, "Should have 10 lines total");
    let blocks = &parsed.heading_blocks;
    assert_eq!(blocks.len(), 3, "Should have 3 heading blocks");
    let (first, second, third) = (&blocks[0], &blocks[1], &blocks[2]);

    assert_eq!(first.path, vec!["Heading at Line 1"]);
    assert_eq!(first.start_line, 1, "First heading starts at line 1");
    assert_eq!(first.end_line, 5, "First heading ends at line 5");

    assert_eq!(second.path, vec!["Heading at Line 1", "Sub at Line 6"]);
    assert_eq!(second.start_line, 6, "Sub heading starts at line 6");
    assert_eq!(second.end_line, 8, "Sub heading ends at line 8");

    assert_eq!(third.path, vec!["Another at Line 9"]);
    assert_eq!(third.start_line, 9, "Another heading starts at line 9");
    assert_eq!(third.end_line, 10, "Another heading ends at line 10");
    Ok(())
}
// Unicode headings nest correctly, including a return to level 3 after going to level 6.
#[test]
fn test_unicode_mixed_headings_edge_cases() -> Result<()> {
    let markdown = r"# 🔥 Main Section
Content with emoji
## Ünïcödë Heading
Спецйальные символы
### Deep → Nested ← Section
More content here
#### Even Deeper
Nested content
##### Fifth Level
Very deep
###### Sixth Level
Deepest level
### Back to Level 3
After deep nesting";
    let parsed = create_test_parser().parse(markdown)?;
    let blocks = &parsed.heading_blocks;

    assert!(blocks.len() >= 7, "Should extract all heading levels");
    assert!(blocks[0].path[0].contains("🔥"));
    assert!(blocks[1].path[1].contains("Ünïcödë"));

    let fifth_level = blocks
        .iter()
        .find(|block| {
            block
                .path
                .last()
                .is_some_and(|segment| segment.contains("Fifth Level"))
        })
        .expect("Should find Fifth Level heading");
    assert!(
        fifth_level.path.len() >= 5,
        "Fifth level should be deeply nested"
    );

    let back_to_three = blocks
        .iter()
        .find(|block| {
            block
                .path
                .last()
                .is_some_and(|segment| segment.contains("Back to Level 3"))
        })
        .expect("Should find Back to Level 3 heading");
    assert_eq!(
        back_to_three.path.len(),
        3,
        "Should be at level 3 after backtracking"
    );
    Ok(())
}
}