use anyhow::Result;
use tree_sitter::Node;
use tree_sitter_md::MarkdownParser;
use crate::index::format::{SymbolEntry, TextEntry};
use crate::parser::helpers::{node_line_range, node_text, push_symbol};
pub fn parse_and_extract(
source: &[u8],
file_path: &str,
) -> Result<(Vec<SymbolEntry>, Vec<TextEntry>)> {
let mut parser = MarkdownParser::default();
let md_tree = parser
.parse(source, None)
.ok_or_else(|| anyhow::anyhow!("failed to parse markdown: {}", file_path))?;
let mut symbols = Vec::new();
let mut texts = Vec::new();
let block_tree = md_tree.block_tree();
let root = block_tree.root_node();
let mut heading_stack: Vec<(u32, String)> = Vec::new();
walk_blocks(
root,
source,
file_path,
&mut heading_stack,
&mut symbols,
&mut texts,
);
Ok((symbols, texts))
}
/// Walks the block tree depth-first, handing headings and fenced code blocks
/// to their dedicated extractors.
///
/// Headings and fenced code blocks are treated as leaves: their children are
/// not visited. Every other node kind is simply descended into, in child
/// order, so entries are produced in document order.
fn walk_blocks(
    node: Node,
    source: &[u8],
    file_path: &str,
    heading_stack: &mut Vec<(u32, String)>,
    symbols: &mut Vec<SymbolEntry>,
    texts: &mut Vec<TextEntry>,
) {
    let kind = node.kind();
    if kind == "atx_heading" {
        extract_atx_heading(node, source, file_path, heading_stack, symbols);
    } else if kind == "setext_heading" {
        extract_setext_heading(node, source, file_path, heading_stack, symbols);
    } else if kind == "fenced_code_block" {
        extract_code_block(node, source, file_path, heading_stack, texts);
    } else {
        // Container or otherwise uninteresting node: recurse into its children.
        let mut cursor = node.walk();
        node.children(&mut cursor).for_each(|child| {
            walk_blocks(child, source, file_path, heading_stack, symbols, texts);
        });
    }
}
/// Records an ATX (`#`-prefixed) heading as a `"section"` symbol.
///
/// Headings whose text is empty are ignored entirely: nothing is pushed to
/// `symbols` and `heading_stack` is left untouched. Otherwise the heading is
/// registered on `heading_stack` and emitted under its qualified name with the
/// enclosing heading (if any) as parent.
fn extract_atx_heading(
    node: Node,
    source: &[u8],
    file_path: &str,
    heading_stack: &mut Vec<(u32, String)>,
    symbols: &mut Vec<SymbolEntry>,
) {
    let depth = count_atx_level(node, source);
    let title = get_heading_text(node, source);

    if !title.is_empty() {
        let span = node_line_range(node);
        let (qualified_name, parent) = compute_qualified_name(heading_stack, depth, &title);
        push_symbol(
            symbols,
            file_path,
            qualified_name,
            "section",
            span,
            parent.as_deref(),
            None,
            None,
            None,
        );
    }
}
/// Records a setext (underlined) heading as a `"section"` symbol.
///
/// Mirrors the ATX handling: an empty heading text is skipped without
/// touching `heading_stack`; otherwise the heading is registered on the stack
/// and emitted under its qualified name with the enclosing heading as parent.
fn extract_setext_heading(
    node: Node,
    source: &[u8],
    file_path: &str,
    heading_stack: &mut Vec<(u32, String)>,
    symbols: &mut Vec<SymbolEntry>,
) {
    let depth = get_setext_level(node, source);
    let title = match get_heading_text(node, source) {
        empty if empty.is_empty() => return,
        title => title,
    };

    let span = node_line_range(node);
    let (qualified_name, parent) = compute_qualified_name(heading_stack, depth, &title);
    push_symbol(
        symbols,
        file_path,
        qualified_name,
        "section",
        span,
        parent.as_deref(),
        None,
        None,
        None,
    );
}
/// Records the body of a fenced code block as a `"sample"` text entry.
///
/// The body is taken from the block's first `code_fence_content` child when
/// one exists; otherwise it is recovered from the block's raw text via
/// `extract_code_content`. Blocks whose body is empty or whitespace-only
/// produce no entry. The entry's line range covers the whole fenced block and
/// its parent is the qualified name of the innermost open heading, if any.
fn extract_code_block(
    node: Node,
    source: &[u8],
    file_path: &str,
    heading_stack: &[(u32, String)],
    texts: &mut Vec<TextEntry>,
) {
    let mut cursor = node.walk();
    let content_node = node
        .children(&mut cursor)
        .find(|child| child.kind() == "code_fence_content");

    let body = match content_node {
        Some(content) => node_text(content, source),
        // No dedicated content node: fall back to slicing the raw block text.
        None => match extract_code_content(&node_text(node, source)) {
            Some(recovered) => recovered,
            None => return,
        },
    };

    if body.trim().is_empty() {
        return;
    }

    texts.push(TextEntry {
        file: file_path.to_string(),
        kind: "sample".to_string(),
        line: node_line_range(node),
        text: body,
        parent: heading_stack.last().map(|(_, name)| name.clone()),
        project: String::new(),
    });
}
/// Extracts the body of a fenced code block from the block's raw text.
///
/// The first line must open the block with a fence of at least three
/// backticks or three tildes (leading whitespace is ignored). The body is
/// every following line up to, but not including, the closing fence; when no
/// closing fence is present the body runs to the end of `raw`.
///
/// A line closes the block only if, after trimming whitespace, it consists
/// solely of the opening fence character repeated at least as many times as in
/// the opening fence. Shorter fences, fences of the other character, and
/// fence-like lines carrying an info string (e.g. ```` ```rust ````) are part
/// of the body, so nested fences are preserved intact.
///
/// Returns `None` when `raw` has no line after the opening line or does not
/// start with a fence. The body lines are joined with `\n`.
fn extract_code_content(raw: &str) -> Option<String> {
    let mut lines = raw.lines().peekable();

    let opening = lines.next()?.trim_start();
    let fence_char = opening
        .chars()
        .next()
        .filter(|c| matches!(c, '`' | '~'))?;
    let fence_len = opening.chars().take_while(|&c| c == fence_char).count();
    if fence_len < 3 {
        return None;
    }

    // A lone opening line carries no body at all.
    lines.peek()?;

    // `fence_char` is ASCII, so the byte length equals the character count
    // whenever every character is the fence character.
    let closes_block = |candidate: &str| {
        candidate.len() >= fence_len && candidate.chars().all(|c| c == fence_char)
    };

    let body: Vec<&str> = lines
        .take_while(|line| !closes_block(line.trim()))
        .collect();
    Some(body.join("\n"))
}
/// Returns the display text of a heading node.
///
/// The first child that is a `heading_content`, `inline`, or `paragraph` node
/// decides the result: the first two are trimmed and have an optional closing
/// `#` sequence removed, while a `paragraph` is only trimmed. When no such
/// child exists, the text is derived from the heading's raw source via
/// `strip_heading_markers`.
fn get_heading_text(node: Node, source: &[u8]) -> String {
    let mut cursor = node.walk();
    let from_child = node
        .children(&mut cursor)
        .find_map(|child| match child.kind() {
            "heading_content" | "inline" => Some(strip_optional_closing_hashes(
                node_text(child, source).trim(),
            )),
            "paragraph" => Some(node_text(child, source).trim().to_string()),
            _ => None,
        });

    match from_child {
        Some(text) => text,
        None => strip_heading_markers(&node_text(node, source)),
    }
}
/// Removes an optional ATX closing sequence (trailing run of `#`) from
/// heading content.
///
/// Following CommonMark, a trailing run of `#` counts as a closing sequence
/// only when it is preceded by a space or tab, or when it makes up the whole
/// content (in which case the heading is empty). Hashes attached to a word
/// (`C#`) or preceded by other characters such as a non-breaking space are
/// part of the title and are kept.
///
/// Trailing whitespace is always removed; when a closing sequence is
/// stripped, the remaining text is trimmed on both sides.
fn strip_optional_closing_hashes(text: &str) -> String {
    let content = text.trim_end();
    let without_hashes = content.trim_end_matches('#');

    // No trailing '#' at all: nothing to strip.
    if without_hashes.len() == content.len() {
        return content.to_string();
    }
    // The content is nothing but a closing sequence, i.e. an empty heading.
    if without_hashes.is_empty() {
        return String::new();
    }
    if without_hashes.ends_with([' ', '\t']) {
        return without_hashes.trim().to_string();
    }
    // The hashes are glued to the preceding text, so they belong to the title.
    content.to_string()
}
/// Derives heading text from a heading's raw source when no content child
/// node is available.
///
/// Only the first line of the trimmed input is considered. If that line does
/// not start with `#`, it is returned trimmed (this covers setext headings,
/// whose underline sits on a later line). Otherwise the leading `#` run is
/// removed and an optional closing `#` sequence is stripped, but only when it
/// is preceded by a space or tab or makes up the whole remaining content, so
/// titles such as `C#` keep their hashes.
fn strip_heading_markers(raw: &str) -> String {
    let first_line = raw.trim().lines().next().unwrap_or("");

    let Some(after_first_hash) = first_line.strip_prefix('#') else {
        return first_line.trim().to_string();
    };
    let content = after_first_hash.trim_start_matches('#').trim();

    let without_closing = content.trim_end_matches('#');
    if without_closing.is_empty() {
        // Nothing but markers: the heading has no text.
        return String::new();
    }
    if without_closing.len() < content.len() && without_closing.ends_with([' ', '\t']) {
        return without_closing.trim_end().to_string();
    }
    content.to_string()
}
/// Determines the level of an ATX heading.
///
/// The level is read from the digit at index 5 of the first child whose kind
/// starts with `atx_h`, ends with `_marker`, and has a decimal digit at that
/// position (e.g. `atx_h3_marker` yields 3). If no child qualifies, the
/// leading `#` characters of the node's text are counted and the count is
/// clamped to the range 1..=6.
fn count_atx_level(node: Node, source: &[u8]) -> u32 {
    let mut cursor = node.walk();
    let from_marker = node.children(&mut cursor).find_map(|child| {
        let kind = child.kind();
        if kind.starts_with("atx_h") && kind.ends_with("_marker") {
            kind.chars().nth(5)?.to_digit(10)
        } else {
            None
        }
    });

    from_marker.unwrap_or_else(|| {
        let hashes = node_text(node, source)
            .chars()
            .take_while(|&c| c == '#')
            .count();
        // Clamped to at most 6, so the narrowing cast cannot truncate.
        hashes.clamp(1, 6) as u32
    })
}
/// Determines the level (1 or 2) of a setext heading.
///
/// The first `setext_h1_underline` / `setext_h2_underline` child decides the
/// level. Without such a child, the last line of the node's text is
/// inspected: an underline starting with `-` means level 2, anything else
/// (including `=` or no text at all) means level 1.
fn get_setext_level(node: Node, source: &[u8]) -> u32 {
    let mut cursor = node.walk();
    let from_underline = node
        .children(&mut cursor)
        .find_map(|child| match child.kind() {
            "setext_h1_underline" => Some(1),
            "setext_h2_underline" => Some(2),
            _ => None,
        });
    if let Some(level) = from_underline {
        return level;
    }

    let text = node_text(node, source);
    let dashed_underline = text
        .lines()
        .last()
        .is_some_and(|line| line.trim().starts_with('-'));
    if dashed_underline { 2 } else { 1 }
}
/// Computes the slash-separated qualified name of a heading and updates the
/// stack of open headings.
///
/// `heading_stack` holds `(level, qualified name)` pairs for the headings
/// currently enclosing the cursor position. Every entry at the top of the
/// stack whose level is greater than or equal to `level` is closed first; the
/// remaining top entry, if any, becomes the parent. The new heading is then
/// pushed onto the stack.
///
/// Returns `(qualified_name, parent_qualified_name)`, where the qualified
/// name is `parent/name` when a parent exists and just `name` otherwise.
fn compute_qualified_name(
    heading_stack: &mut Vec<(u32, String)>,
    level: u32,
    name: &str,
) -> (String, Option<String>) {
    // Keep everything up to and including the nearest shallower heading.
    let keep = heading_stack
        .iter()
        .rposition(|(depth, _)| *depth < level)
        .map_or(0, |index| index + 1);
    heading_stack.truncate(keep);

    let parent = heading_stack.last().map(|(_, qname)| qname.clone());
    let qualified_name = parent
        .as_ref()
        .map_or_else(|| name.to_string(), |parent_name| format!("{}/{}", parent_name, name));

    heading_stack.push((level, qualified_name.clone()));
    (qualified_name, parent)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the extractor on `source` under a fixed file name, panicking if
    /// parsing fails.
    fn extract(source: &[u8]) -> (Vec<SymbolEntry>, Vec<TextEntry>) {
        parse_and_extract(source, "test.md").unwrap()
    }

    /// Asserts that the first `expected.len()` symbols carry the given names,
    /// in order.
    fn assert_names(symbols: &[SymbolEntry], expected: &[&str]) {
        assert!(symbols.len() >= expected.len());
        for (symbol, name) in symbols.iter().zip(expected) {
            assert_eq!(symbol.name, *name);
        }
    }

    /// Asserts that the first `expected.len()` symbols carry the given
    /// parents, in order.
    fn assert_parents(symbols: &[SymbolEntry], expected: &[Option<&str>]) {
        assert!(symbols.len() >= expected.len());
        for (symbol, parent) in symbols.iter().zip(expected) {
            assert_eq!(symbol.parent, parent.map(str::to_string));
        }
    }

    // Two ATX headings nest by level and are both reported as sections.
    #[test]
    fn test_atx_headings_basic() {
        let (symbols, _) =
            extract(b"# Top Level\n\nSome content.\n\n## Sub Section\n\nMore content.\n");

        assert_eq!(symbols.len(), 2);
        assert_names(&symbols, &["Top Level", "Top Level/Sub Section"]);
        assert_parents(&symbols, &[None, Some("Top Level")]);
        assert!(symbols.iter().all(|symbol| symbol.kind == "section"));
    }

    // Returning to a shallower level closes the deeper headings.
    #[test]
    fn test_atx_headings_hierarchy() {
        let (symbols, _) = extract(
            b"# Chapter 1\n## Section A\n### Detail 1\n### Detail 2\n## Section B\n# Chapter 2\n",
        );

        assert_eq!(symbols.len(), 6);
        assert_names(
            &symbols,
            &[
                "Chapter 1",
                "Chapter 1/Section A",
                "Chapter 1/Section A/Detail 1",
                "Chapter 1/Section A/Detail 2",
                "Chapter 1/Section B",
                "Chapter 2",
            ],
        );
        assert_parents(
            &symbols,
            &[
                None,
                Some("Chapter 1"),
                Some("Chapter 1/Section A"),
                Some("Chapter 1/Section A"),
                Some("Chapter 1"),
                None,
            ],
        );
    }

    // `===` underlines give level 1, `---` underlines give level 2.
    #[test]
    fn test_setext_headings() {
        let (symbols, _) =
            extract(b"Heading One\n===========\n\nSome text.\n\nHeading Two\n-----------\n");

        assert!(symbols.len() >= 2);
        assert_names(&symbols, &["Heading One", "Heading One/Heading Two"]);
        assert_eq!(symbols[0].kind, "section");
        assert_eq!(symbols[1].parent, Some("Heading One".to_string()));
    }

    // A fenced block becomes a sample attached to the enclosing heading.
    #[test]
    fn test_code_blocks() {
        let (symbols, texts) =
            extract(b"# Setup\n\n```rust\nfn main() {\n println!(\"Hello\");\n}\n```\n");

        assert_eq!(symbols.len(), 1);
        assert_names(&symbols, &["Setup"]);

        assert!(!texts.is_empty());
        let sample = &texts[0];
        assert_eq!(sample.kind, "sample");
        assert!(sample.text.contains("fn main"));
        assert_eq!(sample.parent, Some("Setup".to_string()));
    }

    // Setext and ATX headings share one hierarchy.
    #[test]
    fn test_mixed_heading_styles() {
        let (symbols, _) =
            extract(b"Main Title\n==========\n\n## ATX Subsection\n\nContent here.\n");

        assert_eq!(symbols.len(), 2);
        assert_names(&symbols, &["Main Title", "Main Title/ATX Subsection"]);
        assert_eq!(symbols[1].parent, Some("Main Title".to_string()));
    }

    // Only the final `#` run is a closing sequence; earlier ones are text.
    #[test]
    fn test_heading_with_trailing_hashes() {
        let (symbols, _) = extract(b"# Heading with trailing ### #\n");

        assert_eq!(symbols.len(), 1);
        assert_names(&symbols, &["Heading with trailing ###"]);
    }

    #[test]
    fn test_heading_with_only_closing_hashes() {
        let (symbols, _) = extract(b"# Simple heading ##\n");

        assert_eq!(symbols.len(), 1);
        assert_names(&symbols, &["Simple heading"]);
    }

    // Multi-byte whitespace inside the title must survive untouched.
    #[test]
    fn test_heading_with_non_breaking_space() {
        let (symbols, _) = extract("# Prototype in\u{a0}a nutshell\n".as_bytes());

        assert_eq!(symbols.len(), 1);
        assert_names(&symbols, &["Prototype in\u{a0}a nutshell"]);
    }

    #[test]
    fn test_heading_with_non_breaking_space_and_closing_hashes() {
        let (symbols, _) = extract("# Title with\u{a0}nbsp ##\n".as_bytes());

        assert_eq!(symbols.len(), 1);
        assert_names(&symbols, &["Title with\u{a0}nbsp"]);
    }

    // A heading without text yields no symbol and does not become a parent.
    #[test]
    fn test_empty_heading_skipped() {
        let (symbols, _) = extract(b"# \n## Real Heading\n");

        assert!(!symbols.iter().any(|symbol| symbol.name.is_empty()));
        assert!(symbols.iter().any(|symbol| symbol.name == "Real Heading"));
    }

    // Samples outside any heading have no parent.
    #[test]
    fn test_code_block_without_heading() {
        let (symbols, texts) = extract(b"```python\nprint('hello')\n```\n");

        assert!(symbols.is_empty());
        assert!(!texts.is_empty());
        let sample = &texts[0];
        assert_eq!(sample.kind, "sample");
        assert_eq!(sample.parent, None);
    }

    #[test]
    fn test_deeply_nested_headings() {
        let (symbols, _) = extract(b"# H1\n## H2\n### H3\n#### H4\n##### H5\n###### H6\n");

        assert_eq!(symbols.len(), 6);
        let deepest = &symbols[5];
        assert_eq!(deepest.name, "H1/H2/H3/H4/H5/H6");
        assert_eq!(deepest.parent, Some("H1/H2/H3/H4/H5".to_string()));
    }

    // Identical titles under different parents get distinct qualified names.
    #[test]
    fn test_duplicate_heading_names() {
        let (symbols, _) =
            extract(b"# v1.0\n## Features\n## Bug Fixes\n# v2.0\n## Features\n## Bug Fixes\n");

        assert_eq!(symbols.len(), 6);
        assert_names(
            &symbols,
            &[
                "v1.0",
                "v1.0/Features",
                "v1.0/Bug Fixes",
                "v2.0",
                "v2.0/Features",
                "v2.0/Bug Fixes",
            ],
        );
    }

    // The sample's parent is the heading's qualified name, not its bare title.
    #[test]
    fn test_code_block_with_qualified_parent() {
        let (symbols, texts) = extract(b"# Guide\n## Installation\n```bash\nnpm install\n```\n");

        assert_eq!(symbols.len(), 2);
        assert_names(&symbols, &["Guide", "Guide/Installation"]);

        assert_eq!(texts.len(), 1);
        let sample = &texts[0];
        assert_eq!(sample.kind, "sample");
        assert_eq!(sample.parent, Some("Guide/Installation".to_string()));
    }
}