agentroot_core/index/ast_chunker/
mod.rs

1//! AST-aware semantic chunking
2//!
3//! This module provides semantic chunking of source code files using tree-sitter
4//! for AST parsing. It extracts functions, classes, methods, and other semantic
5//! units while preserving context like docstrings and comments.
6
7pub mod language;
8pub mod oversized;
9pub mod parser;
10pub mod strategies;
11pub mod types;
12
13pub use language::{is_supported, Language};
14pub use oversized::{split_oversized_chunk, split_oversized_chunks};
15pub use strategies::{
16    ChunkingStrategy, GoStrategy, JavaScriptStrategy, LanguageStrategy, PythonStrategy,
17    RustStrategy,
18};
19pub use types::{compute_chunk_hash, ChunkMetadata, ChunkType, SemanticChunk};
20
21use super::chunker::{chunk_by_chars, Chunk, CHUNK_OVERLAP_CHARS, CHUNK_SIZE_CHARS};
22use crate::error::Result;
23use std::path::Path;
24use tracing::debug;
25
26const MIN_CHUNK_CHARS: usize = 1;
27
/// Main semantic chunker that delegates to language-specific strategies
///
/// Splits source files into semantic units via tree-sitter when the language
/// is supported, and falls back to character-based chunking otherwise.
pub struct SemanticChunker {
    // Upper bound, in characters, for a single chunk; chunks exceeding this
    // are split by `split_oversized_chunks`. Defaults to CHUNK_SIZE_CHARS.
    max_chunk_chars: usize,
}
32
/// Delegates to [`SemanticChunker::new`] so the default instance and the
/// explicit constructor cannot drift apart.
impl Default for SemanticChunker {
    fn default() -> Self {
        Self::new()
    }
}
38
39impl SemanticChunker {
40    pub fn new() -> Self {
41        Self {
42            max_chunk_chars: CHUNK_SIZE_CHARS,
43        }
44    }
45
46    pub fn with_max_chunk_chars(self, max: usize) -> Self {
47        let max = if max < MIN_CHUNK_CHARS {
48            MIN_CHUNK_CHARS
49        } else {
50            max
51        };
52        Self {
53            max_chunk_chars: max,
54        }
55    }
56
57    /// Chunk content semantically based on file path
58    ///
59    /// For supported languages, uses AST-based chunking.
60    /// For unsupported languages, falls back to character-based chunking.
61    pub fn chunk(&self, content: &str, path: &Path) -> Result<Vec<SemanticChunk>> {
62        let language = match Language::from_path(path) {
63            Some(lang) => lang,
64            None => return self.fallback_chunk(content),
65        };
66
67        let tree = match parser::parse(content, language) {
68            Ok(tree) => tree,
69            Err(e) => {
70                debug!(
71                    error = %e,
72                    path = %path.display(),
73                    language = %language.as_str(),
74                    "AST parse failed, falling back to character-based chunking"
75                );
76                return self.fallback_chunk(content);
77            }
78        };
79
80        let strategy = LanguageStrategy::for_language(language);
81        let chunks = strategy.extract_chunks(content, tree.root_node())?;
82        let chunks = split_oversized_chunks(chunks, self.max_chunk_chars);
83
84        Ok(chunks)
85    }
86
87    /// Fallback to character-based chunking for unsupported files
88    fn fallback_chunk(&self, content: &str) -> Result<Vec<SemanticChunk>> {
89        let char_chunks = chunk_by_chars(content, CHUNK_SIZE_CHARS, CHUNK_OVERLAP_CHARS);
90
91        let semantic_chunks = char_chunks
92            .into_iter()
93            .map(|c| {
94                let hash = compute_chunk_hash(&c.text, "", "");
95                SemanticChunk {
96                    text: c.text,
97                    chunk_type: ChunkType::Text,
98                    chunk_hash: hash,
99                    position: c.position,
100                    token_count: c.token_count,
101                    metadata: ChunkMetadata::default(),
102                }
103            })
104            .collect();
105
106        Ok(semantic_chunks)
107    }
108}
109
110/// Convenience function for semantic chunking
111pub fn chunk_semantic(content: &str, path: &Path) -> Result<Vec<SemanticChunk>> {
112    SemanticChunker::new().chunk(content, path)
113}
114
115/// Convert a SemanticChunk to a basic Chunk (for backwards compatibility)
116impl From<SemanticChunk> for Chunk {
117    fn from(sc: SemanticChunk) -> Self {
118        Chunk {
119            text: sc.text,
120            position: sc.position,
121            token_count: sc.token_count,
122        }
123    }
124}
125
#[cfg(test)]
mod tests {
    use super::*;

    // A Rust file should yield at least one Function and one Struct chunk.
    #[test]
    fn test_rust_file_chunking() {
        let source = r#"
/// A greeting function
fn hello() {
    println!("Hello, world!");
}

struct Point {
    x: i32,
    y: i32,
}
"#;
        let chunks = chunk_semantic(source, Path::new("test.rs")).unwrap();

        assert!(chunks.len() >= 2);
        let has_function = chunks.iter().any(|c| c.chunk_type == ChunkType::Function);
        let has_struct = chunks.iter().any(|c| c.chunk_type == ChunkType::Struct);
        assert!(has_function);
        assert!(has_struct);
    }

    // Python sources go through the AST path and produce chunks.
    #[test]
    fn test_python_file_chunking() {
        let source = r#"
def greet(name):
    """Greet someone."""
    print(f"Hello, {name}!")

class Greeter:
    def __init__(self):
        pass
"#;
        let chunks = chunk_semantic(source, Path::new("test.py")).unwrap();

        assert!(!chunks.is_empty());
    }

    // Unsupported extensions fall back to character-based Text chunks.
    #[test]
    fn test_markdown_fallback() {
        let chunks =
            chunk_semantic("# Hello\n\nThis is markdown content.", Path::new("test.md")).unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].chunk_type, ChunkType::Text);
    }

    // Every produced chunk carries a 32-character hash.
    #[test]
    fn test_chunk_hash_in_semantic_chunks() {
        let chunks = chunk_semantic("fn test() {}", Path::new("test.rs")).unwrap();

        assert!(chunks.iter().all(|c| c.chunk_hash.len() == 32));
    }

    // SemanticChunk -> Chunk conversion keeps text and position.
    #[test]
    fn test_semantic_to_basic_chunk_conversion() {
        let semantic = SemanticChunk::new("test".to_string(), ChunkType::Function, 0);
        let basic = Chunk::from(semantic);

        assert_eq!(basic.text, "test");
        assert_eq!(basic.position, 0);
    }

    // Zero is clamped to the minimum; valid sizes pass through unchanged.
    #[test]
    fn test_with_max_chunk_chars_validation() {
        let clamped = SemanticChunker::new().with_max_chunk_chars(0);
        assert_eq!(clamped.max_chunk_chars, MIN_CHUNK_CHARS);

        let explicit = SemanticChunker::new().with_max_chunk_chars(500);
        assert_eq!(explicit.max_chunk_chars, 500);
    }
}