use std::sync::OnceLock;

use regex::Regex;
use tree_sitter::{Node, Parser};

use super::common::HierarchyTracker;
use crate::indexer::SymbolChunk;
use crate::profile_scope;
/// Returns the tree-sitter grammar used to parse Markdown block structure.
// allow(dead_code): referenced only from `extract_markdown_chunks` below;
// kept as a named helper so the grammar crate is mentioned in one place.
#[allow(dead_code)]
fn lang_markdown() -> tree_sitter::Language {
    tree_sitter_md::language()
}
/// Parses `source` as Markdown and returns structural chunks: headings,
/// fenced code blocks, pipe tables, lists, and inline links.
///
/// Returns an empty vector when the grammar cannot be loaded or the parse
/// fails outright.
pub(super) fn extract_markdown_chunks(source: &str) -> Vec<SymbolChunk> {
    profile_scope!("extract_markdown_chunks");
    let mut parser = Parser::new();
    // A grammar/ABI-version mismatch makes the parser unusable; bail out
    // explicitly instead of discarding the error with `.ok()` and letting
    // `parse()` fail further down.
    if parser.set_language(&lang_markdown()).is_err() {
        return Vec::new();
    }
    let Some(tree) = parser.parse(source, None) else {
        return Vec::new();
    };
    let mut chunks = Vec::new();
    let mut hierarchy = HierarchyTracker::new();
    walk_markdown_nodes(source, tree.root_node(), &mut chunks, &mut hierarchy);
    // Links are found with a regex pass over the raw text, not via the tree.
    extract_markdown_links(source, &mut chunks);
    chunks
}
/// Depth-first walk over the syntax tree, emitting a chunk for every node
/// kind we recognize before descending into its children.
fn walk_markdown_nodes(
    source: &str,
    node: Node,
    chunks: &mut Vec<SymbolChunk>,
    hierarchy: &mut HierarchyTracker,
) {
    // Dispatch on the current node first so parents precede their children
    // in the output (pre-order traversal).
    match node.kind() {
        "atx_heading" => extract_heading(source, node, chunks, hierarchy),
        "fenced_code_block" => extract_code_block(source, node, chunks, hierarchy),
        "pipe_table" => extract_table(source, node, chunks),
        "list" => extract_list(source, node, chunks),
        _ => {}
    }
    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        walk_markdown_nodes(source, child, chunks, hierarchy);
    }
}
/// Emits a `heading_N` chunk for one ATX heading node and records it in the
/// hierarchy tracker so nested content can report its parent path.
fn extract_heading(
    source: &str,
    node: Node,
    chunks: &mut Vec<SymbolChunk>,
    hierarchy: &mut HierarchyTracker,
) {
    // Marker kinds in level order; index + 1 is the heading level.
    const MARKERS: [&str; 6] = [
        "atx_h1_marker",
        "atx_h2_marker",
        "atx_h3_marker",
        "atx_h4_marker",
        "atx_h5_marker",
        "atx_h6_marker",
    ];

    let mut level = 0usize;
    let mut title = String::new();
    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        let kind = child.kind();
        if kind == "inline" {
            if let Ok(text) = child.utf8_text(source.as_bytes()) {
                title = text.trim().to_owned();
            }
        } else if let Some(idx) = MARKERS.iter().position(|m| *m == kind) {
            level = idx + 1;
        }
    }

    // Skip malformed headings (no marker found) and empty titles.
    if level == 0 || title.is_empty() {
        return;
    }

    let start_line = (node.start_position().row + 1) as i32;
    let end_line = find_section_end(source, node, level);
    let parent_path = hierarchy.enter_heading(level as u8, title.clone());
    chunks.push(SymbolChunk {
        symbol_name: Some(title),
        kind: format!("heading_{}", level),
        signature: None,
        docstring: None,
        start_line,
        end_line,
        metadata: Some(serde_json::json!({
            "level": level,
            "parent_path": parent_path
        })),
    });
}
/// Returns the (1-based-exclusive) line on which the section opened by
/// `heading_node` ends: the next heading of the same or shallower level, or
/// the end of the document.
///
/// Lines that look like headings inside fenced code blocks are ignored.
/// Both backtick and tilde fences are recognized (CommonMark), and a block
/// is only closed by a fence using the same marker it was opened with, so a
/// stray "~~~" inside a backtick block does not end it.
fn find_section_end(source: &str, heading_node: Node, heading_level: usize) -> i32 {
    let start_row = heading_node.start_position().row;
    let lines: Vec<&str> = source.lines().collect();
    let mut idx = start_row + 1;
    // Marker of the currently open fence ("```" or "~~~"), None when outside.
    let mut open_fence: Option<&str> = None;
    while idx < lines.len() {
        let trimmed = lines[idx].trim();
        let fence = ["```", "~~~"].iter().copied().find(|f| trimmed.starts_with(f));
        if let Some(marker) = fence {
            match open_fence {
                None => open_fence = Some(marker),
                Some(open) if open == marker => open_fence = None,
                // A different fence marker inside an open block is content.
                Some(_) => {}
            }
            idx += 1;
            continue;
        }
        if open_fence.is_none() {
            if let Some(next_level) = get_heading_level_from_line(lines[idx]) {
                if next_level <= heading_level {
                    return idx as i32;
                }
            }
        }
        idx += 1;
    }
    lines.len() as i32
}
/// Returns the ATX heading level (1-6) if `line` is a heading, else `None`.
///
/// Per CommonMark, the opening run of 1-6 `#` characters must be followed by
/// a space, a tab, or the end of the line — so a bare "##" line is a valid
/// (empty) heading. The previous version required a literal space, which
/// missed tab-separated and text-less headings.
fn get_heading_level_from_line(line: &str) -> Option<usize> {
    let trimmed = line.trim_start();
    let level = trimmed.chars().take_while(|&c| c == '#').count();
    if level == 0 || level > 6 {
        return None;
    }
    match trimmed.chars().nth(level) {
        // End of line, space, or tab after the marker => valid heading.
        None | Some(' ') | Some('\t') => Some(level),
        _ => None,
    }
}
/// Emits a `code_block` chunk for one fenced code block, recording its
/// info-string language (if any) and content line count.
fn extract_code_block(
    source: &str,
    node: Node,
    chunks: &mut Vec<SymbolChunk>,
    hierarchy: &HierarchyTracker,
) {
    let mut language: Option<String> = None;
    let mut loc = 0usize;
    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        let Ok(text) = child.utf8_text(source.as_bytes()) else {
            continue;
        };
        match child.kind() {
            // First whitespace-separated token of the info string is the
            // language tag ("rust" in "```rust ignore").
            "info_string" => {
                let tag = text.split_whitespace().next().unwrap_or(text.trim());
                language = Some(tag.to_owned());
            }
            "code_fence_content" => loc = text.lines().count(),
            _ => {}
        }
    }

    let label = language.as_deref().unwrap_or("plain").to_owned();
    chunks.push(SymbolChunk {
        symbol_name: Some(format!("Code: {}", label)),
        kind: "code_block".to_string(),
        signature: None,
        docstring: None,
        start_line: (node.start_position().row + 1) as i32,
        end_line: (node.end_position().row + 1) as i32,
        metadata: Some(serde_json::json!({
            "language": label,
            "parent_path": hierarchy.get_current_path(),
            "lines_of_code": loc
        })),
    });
}
/// Emits a `markdown_section` chunk for one pipe table, with row/column
/// counts and a header flag in the metadata.
fn extract_table(_source: &str, node: Node, chunks: &mut Vec<SymbolChunk>) {
    let mut rows = 0;
    let mut cols = 0;
    let mut has_header = false;
    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        match child.kind() {
            "pipe_table_header" => {
                has_header = true;
                rows += 1;
                // Column count comes from the header's cell children.
                let mut cells = child.walk();
                cols += child
                    .children(&mut cells)
                    .filter(|c| c.kind() == "pipe_table_cell")
                    .count();
            }
            "pipe_table_row" => rows += 1,
            _ => {}
        }
    }
    chunks.push(SymbolChunk {
        symbol_name: Some(format!("Table {}x{}", rows, cols)),
        kind: "markdown_section".to_string(),
        signature: None,
        docstring: None,
        start_line: (node.start_position().row + 1) as i32,
        end_line: (node.end_position().row + 1) as i32,
        metadata: Some(serde_json::json!({
            "section_type": "table",
            "rows": rows,
            "columns": cols,
            "has_header": has_header
        })),
    });
}
/// Emits a `markdown_section` chunk for one list, classifying it as ordered
/// or unordered from the first item's marker.
fn extract_list(_source: &str, node: Node, chunks: &mut Vec<SymbolChunk>) {
    let mut items = 0;
    let mut ordered = false;
    let mut cursor = node.walk();
    for child in node.children(&mut cursor) {
        if child.kind() != "list_item" {
            continue;
        }
        items += 1;
        // Only the first item decides the list type; a dot marker ("1.")
        // means an ordered list.
        if items == 1 {
            let mut markers = child.walk();
            ordered = child
                .children(&mut markers)
                .any(|m| m.kind() == "list_marker_dot");
        }
    }
    let list_type = if ordered { "ordered" } else { "unordered" };
    chunks.push(SymbolChunk {
        symbol_name: Some(format!("List ({} items)", items)),
        kind: "markdown_section".to_string(),
        signature: None,
        docstring: None,
        start_line: (node.start_position().row + 1) as i32,
        end_line: (node.end_position().row + 1) as i32,
        metadata: Some(serde_json::json!({
            "list_type": list_type,
            "item_count": items
        })),
    });
}
fn extract_markdown_links(source: &str, chunks: &mut Vec<SymbolChunk>) {
let link_pattern = Regex::new(r"(?m)(!?)\[([^\]]*)\]\(([^)]+)\)").unwrap();
for cap in link_pattern.captures_iter(source) {
let is_image = cap.get(1).is_some_and(|m| m.as_str() == "!");
let link_text = cap.get(2).map_or("", |m| m.as_str());
let target = cap.get(3).map_or("", |m| m.as_str());
if target.trim().is_empty() {
continue;
}
let link_type = classify_link(target);
let full_match = cap.get(0).unwrap();
let link_position = full_match.start();
let line_number = find_line_number(source, link_position);
let metadata = serde_json::json!({
"link_type": link_type,
"target": target,
"link_text": link_text,
"is_image": is_image,
});
let kind = if is_image { "image_link" } else { "link" };
let symbol_name = if !link_text.is_empty() {
Some(link_text.to_string())
} else {
Some(target.to_string())
};
chunks.push(SymbolChunk {
symbol_name,
kind: kind.to_string(),
signature: Some(target.to_string()),
docstring: None,
start_line: line_number as i32,
end_line: line_number as i32,
metadata: Some(metadata),
});
}
}
/// Buckets a link target by destination kind: "external" (http/https URL),
/// "anchor" (in-document fragment), "absolute" (root-relative path), or
/// "relative" (everything else, including an empty target).
fn classify_link(target: &str) -> String {
    let kind = if target.starts_with("http://") || target.starts_with("https://") {
        "external"
    } else {
        match target.as_bytes().first() {
            Some(b'#') => "anchor",
            Some(b'/') => "absolute",
            _ => "relative",
        }
    };
    kind.to_string()
}
/// Converts a byte offset in `source` into a 1-based line number.
///
/// Counts the newline bytes preceding `position`, which is correct for both
/// LF and CRLF line endings. The previous implementation accumulated
/// `line.len() + 1` per line, drifting one byte per CRLF line (`lines()`
/// strips two terminator bytes) and falling through to line 1 for offsets
/// near the end of CRLF text. Offsets past the end of `source` clamp to the
/// last line.
fn find_line_number(source: &str, position: usize) -> usize {
    let end = position.min(source.len());
    source.as_bytes()[..end].iter().filter(|&&b| b == b'\n').count() + 1
}