use anyhow::{Context, Result};
use tree_sitter::{Node, Parser, Query, QueryCursor};
use crate::syntax::parser::{CodeChunk, SyntaxParser};
/// Maximum number of source lines placed into one chunk by the line-based
/// fallback in `parse` (used when the query yields no chunks).
const MAX_CHUNK_LINES: usize = 500;
/// Markdown chunker that pairs a tree-sitter [`Parser`] with the [`Query`]
/// used to select the nodes that become chunks.
pub struct MarkdownParser {
/// Tree-sitter parser; `new` configures it with the Markdown grammar.
parser: Parser,
/// Query that captures headings, fenced code blocks, list items and
/// paragraphs under the capture name `item`.
query: Query,
}
impl MarkdownParser {
    /// Creates a parser configured with the Markdown grammar and compiles the
    /// query that captures chunk candidates (ATX/setext headings, fenced code
    /// blocks, list items and paragraphs) under the capture name `item`.
    ///
    /// # Panics
    ///
    /// Panics if the Markdown grammar cannot be loaded into the parser or if
    /// the query fails to compile. Both indicate a build/version mismatch
    /// rather than a runtime condition.
    pub fn new() -> Self {
        let mut parser = Parser::new();
        let language = tree_sitter_md::language();
        parser
            .set_language(&language)
            .expect("Error loading Markdown (md) grammar");
        let query = Query::new(
            &language,
            r#"
[
(atx_heading) @item
(setext_heading) @item
(fenced_code_block) @item
(list_item) @item
(paragraph) @item ; Capture all paragraphs initially
]
"#,
        )
        .expect("Error creating Markdown (md) query");
        MarkdownParser { parser, query }
    }

    /// Classifies an ATX heading (`# Title` … `###### Title`) by counting its
    /// leading `#` characters.
    ///
    /// Leading whitespace is skipped first so that an indented marker is still
    /// recognised. Returns `"h1"`..`"h6"`, or `"heading"` when the text does
    /// not start with one to six `#`.
    fn atx_heading_type(content: &str) -> &'static str {
        let hashes = content
            .trim_start()
            .bytes()
            .take_while(|&byte| byte == b'#')
            .count();
        match hashes {
            1 => "h1",
            2 => "h2",
            3 => "h3",
            4 => "h4",
            5 => "h5",
            6 => "h6",
            _ => "heading",
        }
    }

    /// Classifies a setext heading by its underline: `=` means `"h1"`, `-`
    /// means `"h2"`.
    ///
    /// The underline is taken to be the last non-blank line after the first
    /// line of `content`. It is trimmed before inspection, so underlines of
    /// any length (even a single character) and indented underlines are
    /// handled. Returns `"heading"` if no such line is found.
    fn setext_heading_type(content: &str) -> &'static str {
        let underline = content
            .lines()
            .skip(1) // the first line is heading text, never the underline
            .map(str::trim)
            .filter(|line| !line.is_empty())
            .last();
        match underline.and_then(|line| line.chars().next()) {
            Some('=') => "h1",
            Some('-') => "h2",
            _ => "heading",
        }
    }

    /// Converts a captured syntax `node` into a [`CodeChunk`].
    ///
    /// * `node` - the captured tree-sitter node.
    /// * `code` - the full source text that `node` was parsed from.
    /// * `file_path` - path recorded on the resulting chunk.
    ///
    /// Returns `None` when the node's byte range cannot be sliced out of
    /// `code` (out of bounds or not on a UTF-8 character boundary).
    ///
    /// Line numbers are 1-based and inclusive. `element_type` is one of
    /// `h1`..`h6`, `heading`, `paragraph`, `code_block`, `list_item` or
    /// `unknown`.
    fn node_to_chunk(
        &self,
        node: Node,
        code: &str,
        file_path: &str,
    ) -> Option<CodeChunk> {
        let content = code.get(node.start_byte()..node.end_byte())?.to_string();

        let start = node.start_position();
        let end = node.end_position();
        let start_line = start.row + 1;
        // A node that ends at column 0 of a later row contains no characters
        // on that row (it merely includes the preceding line break), so its
        // last line is the previous one. This keeps `end_line` inclusive, the
        // same convention the line-based fallback in `parse` uses.
        let end_line = if end.column == 0 && end.row > start.row {
            end.row
        } else {
            end.row + 1
        };

        let element_type = match node.kind() {
            "atx_heading" => Self::atx_heading_type(&content),
            "setext_heading" => Self::setext_heading_type(&content),
            "paragraph" => "paragraph",
            "fenced_code_block" => "code_block",
            "list_item" => "list_item",
            _ => "unknown",
        };

        Some(CodeChunk {
            content,
            file_path: file_path.to_string(),
            start_line,
            end_line,
            language: "markdown".to_string(),
            element_type: element_type.to_string(),
        })
    }
}
impl SyntaxParser for MarkdownParser {
    /// Splits Markdown `code` into chunks, tagging each with `file_path`.
    ///
    /// Every node captured as `item` by the query is converted with
    /// `node_to_chunk`. Two kinds of captures are then discarded because an
    /// enclosing list item already covers them:
    ///
    /// * a paragraph whose parent is a `list_item`;
    /// * a `list_item` whose parent is a `list` that itself sits inside a
    ///   `list_item` (i.e. a nested list item).
    ///
    /// If nothing survives and `code` is not only whitespace, the text is cut
    /// into consecutive groups of at most `MAX_CHUNK_LINES` lines instead.
    ///
    /// # Errors
    ///
    /// Returns an error when the tree-sitter parser produces no tree.
    fn parse(&mut self, code: &str, file_path: &str) -> Result<Vec<CodeChunk>> {
        let tree = self
            .parser
            .parse(code, None)
            .context("Failed to parse Markdown code")?;

        let mut query_cursor = QueryCursor::new();
        let mut chunks: Vec<CodeChunk> = Vec::new();

        for query_match in query_cursor.matches(&self.query, tree.root_node(), code.as_bytes()) {
            for capture in query_match.captures {
                if self.query.capture_names()[capture.index as usize] != "item" {
                    continue;
                }
                let node = capture.node;
                let chunk = match self.node_to_chunk(node, code, file_path) {
                    Some(chunk) => chunk,
                    None => continue,
                };

                // Kinds of the two nearest ancestors decide whether this
                // capture is already covered by an enclosing list item.
                let parent = node.parent();
                let parent_kind = parent.map(|p| p.kind());
                let grandparent_kind = parent.and_then(|p| p.parent()).map(|g| g.kind());
                let covered_by_list_item = matches!(
                    (chunk.element_type.as_str(), parent_kind, grandparent_kind),
                    ("paragraph", Some("list_item"), _)
                        | ("list_item", Some("list"), Some("list_item"))
                );

                if !covered_by_list_item {
                    chunks.push(chunk);
                }
            }
        }

        if chunks.is_empty() && !code.trim().is_empty() {
            log::debug!(
                "No specific Markdown elements found or only whitespace in {}. Applying line-based fallback chunking.",
                file_path
            );
            let all_lines: Vec<&str> = code.lines().collect();
            chunks.extend(
                all_lines
                    .chunks(MAX_CHUNK_LINES)
                    .enumerate()
                    .map(|(index, group)| {
                        // Every group before the last holds exactly
                        // MAX_CHUNK_LINES lines, so the 1-based start line
                        // follows directly from the group index.
                        let start_line = index * MAX_CHUNK_LINES + 1;
                        CodeChunk {
                            content: group.join("\n"),
                            file_path: file_path.to_string(),
                            start_line,
                            end_line: start_line + group.len() - 1,
                            language: "markdown".to_string(),
                            element_type: format!("fallback_line_chunk_{}", index),
                        }
                    }),
            );
        }

        Ok(chunks)
    }
}