agentroot_core/index/ast_chunker/strategies/
mod.rs

1//! Language-specific chunking strategies
2
3mod go;
4mod javascript;
5mod python;
6mod rust;
7
8pub use go::GoStrategy;
9pub use javascript::JavaScriptStrategy;
10pub use python::PythonStrategy;
11pub use rust::RustStrategy;
12
13use super::language::Language;
14use super::types::{ChunkType, SemanticChunk};
15use crate::error::Result;
16use tree_sitter::Node;
17
18/// Trait for language-specific semantic chunking
19pub trait ChunkingStrategy: Send + Sync {
20    /// Get the node types that represent semantic boundaries
21    fn semantic_node_types(&self) -> &[&str];
22
23    /// Extract chunks from the source given the AST root
24    fn extract_chunks(&self, source: &str, root: Node) -> Result<Vec<SemanticChunk>>;
25
26    /// Determine chunk type from AST node
27    fn chunk_type_for_node(&self, node: Node) -> ChunkType;
28
29    /// Extract leading trivia (comments/docs) for a node
30    fn extract_leading_trivia(&self, source: &str, node: Node) -> String {
31        extract_leading_comments(source, node)
32    }
33
34    /// Extract trailing trivia for a node
35    fn extract_trailing_trivia(&self, source: &str, node: Node) -> String {
36        extract_trailing_comment(source, node)
37    }
38}
39
40/// Enum-based strategy dispatch to avoid heap allocation
41pub enum LanguageStrategy {
42    Rust(RustStrategy),
43    Python(PythonStrategy),
44    JavaScript(JavaScriptStrategy),
45    Go(GoStrategy),
46}
47
48impl LanguageStrategy {
49    pub fn for_language(language: Language) -> Self {
50        match language {
51            Language::Rust => Self::Rust(RustStrategy),
52            Language::Python => Self::Python(PythonStrategy),
53            Language::JavaScript => Self::JavaScript(JavaScriptStrategy::javascript()),
54            Language::TypeScript | Language::TypeScriptTsx => {
55                Self::JavaScript(JavaScriptStrategy::typescript())
56            }
57            Language::Go => Self::Go(GoStrategy),
58        }
59    }
60
61    pub fn extract_chunks(&self, source: &str, root: Node) -> Result<Vec<SemanticChunk>> {
62        match self {
63            Self::Rust(s) => s.extract_chunks(source, root),
64            Self::Python(s) => s.extract_chunks(source, root),
65            Self::JavaScript(s) => s.extract_chunks(source, root),
66            Self::Go(s) => s.extract_chunks(source, root),
67        }
68    }
69}
70
71/// Extract leading comments/docs above a node
72pub fn extract_leading_comments(source: &str, node: Node) -> String {
73    let start_byte = node.start_byte();
74    if start_byte == 0 {
75        return String::new();
76    }
77
78    let preceding = &source[..start_byte];
79    let lines: Vec<&str> = preceding.lines().rev().collect();
80    let mut trivia_lines = Vec::new();
81
82    for line in lines {
83        let trimmed = line.trim();
84        if trimmed.is_empty() {
85            if !trivia_lines.is_empty() {
86                break;
87            }
88            continue;
89        }
90        if is_comment_line(trimmed) {
91            trivia_lines.push(line);
92        } else {
93            break;
94        }
95    }
96
97    trivia_lines.reverse();
98    if trivia_lines.is_empty() {
99        String::new()
100    } else {
101        trivia_lines.join("\n")
102    }
103}
104
/// Reports whether `line` begins with one of the recognised comment markers.
///
/// The check is a plain prefix test on `line` exactly as given, so callers
/// are expected to pass text with leading whitespace already removed.
///
/// Recognised prefixes: `//` (which also covers `///` and `//!`), `#`, `/*`,
/// `*` (which also covers `*/`), `"""` and `'''`.
///
/// Because this is purely prefix-based, any line starting with `#` or `*` is
/// treated as a comment, e.g. `#[derive(Debug)]`.
fn is_comment_line(line: &str) -> bool {
    // Longer markers such as `///`, `//!` and `*/` are intentionally absent:
    // they are already matched by the shorter `//` and `*` prefixes.
    const COMMENT_PREFIXES: [&str; 6] = ["//", "#", "/*", "*", "\"\"\"", "'''"];

    COMMENT_PREFIXES
        .iter()
        .any(|prefix| line.starts_with(*prefix))
}
117
118/// Extract trailing comment on the same line
119pub fn extract_trailing_comment(source: &str, node: Node) -> String {
120    let end_byte = node.end_byte();
121    if end_byte >= source.len() {
122        return String::new();
123    }
124
125    let following = &source[end_byte..];
126    if let Some(line_end) = following.find('\n') {
127        let same_line = following[..line_end].trim();
128        if same_line.starts_with("//") || same_line.starts_with('#') {
129            return same_line.to_string();
130        }
131    }
132    String::new()
133}
134
/// Computes the 1-based line numbers at which a byte range starts and ends.
///
/// A position's line number is one plus the number of `'\n'` bytes that
/// occur before it in `source`. The result is `(start_line, end_line)`.
///
/// Offsets are treated as raw byte positions: they do not have to fall on a
/// UTF-8 character boundary, and an offset past the end of `source` is
/// clamped to its length, so this function never panics.
pub fn line_numbers(source: &str, start_byte: usize, end_byte: usize) -> (usize, usize) {
    let bytes = source.as_bytes();

    // Counting the newline byte directly is exact for UTF-8: 0x0A never
    // appears inside a multi-byte sequence.
    let line_at = |offset: usize| {
        let prefix = &bytes[..offset.min(bytes.len())];
        prefix.iter().filter(|&&b| b == b'\n').count() + 1
    };

    (line_at(start_byte), line_at(end_byte))
}
141
142/// Get breadcrumb path for a node (e.g., "ClassName::method_name")
143pub fn get_breadcrumb(source: &str, node: Node) -> Option<String> {
144    let mut parts = Vec::new();
145    let mut current = Some(node);
146
147    while let Some(n) = current {
148        if let Some(name) = extract_name_from_node(source, n) {
149            parts.push(name);
150        }
151        current = n.parent();
152    }
153
154    if parts.is_empty() {
155        None
156    } else {
157        parts.reverse();
158        Some(parts.join("::"))
159    }
160}
161
162/// Extract name identifier from a node
163fn extract_name_from_node(source: &str, node: Node) -> Option<String> {
164    let kind = node.kind();
165    let name_field = match kind {
166        "function_item"
167        | "function_definition"
168        | "function_declaration"
169        | "method_definition"
170        | "method_declaration" => "name",
171        "impl_item" => "type",
172        "struct_item" | "class_definition" | "class_declaration" => "name",
173        "enum_item" | "type_declaration" => "name",
174        "trait_item" | "interface_declaration" => "name",
175        _ => return None,
176    };
177
178    node.child_by_field_name(name_field)
179        .map(|n| source[n.start_byte()..n.end_byte()].to_string())
180}