agentroot_core/index/ast_chunker/
mod.rs1pub mod language;
8pub mod oversized;
9pub mod parser;
10pub mod strategies;
11pub mod types;
12
13pub use language::{is_supported, Language};
14pub use oversized::{split_oversized_chunk, split_oversized_chunks};
15pub use strategies::{
16 ChunkingStrategy, GoStrategy, JavaScriptStrategy, LanguageStrategy, PythonStrategy,
17 RustStrategy,
18};
19pub use types::{compute_chunk_hash, ChunkMetadata, ChunkType, SemanticChunk};
20
21use super::chunker::{chunk_by_chars, Chunk, CHUNK_OVERLAP_CHARS, CHUNK_SIZE_CHARS};
22use crate::error::Result;
23use std::path::Path;
24use tracing::debug;
25
/// Lower bound for the configurable maximum chunk size.
/// `with_max_chunk_chars` clamps to this so a zero-sized limit can never
/// be configured.
const MIN_CHUNK_CHARS: usize = 1;
27
/// Splits source text into semantically meaningful chunks (functions,
/// structs, …) by parsing it into an AST, falling back to character-based
/// chunking when the language is unsupported or parsing fails.
pub struct SemanticChunker {
    // Upper bound, in characters, for a single chunk; AST chunks larger
    // than this are split further. Clamped to at least `MIN_CHUNK_CHARS`
    // by `with_max_chunk_chars`.
    max_chunk_chars: usize,
}
32
33impl Default for SemanticChunker {
34 fn default() -> Self {
35 Self::new()
36 }
37}
38
39impl SemanticChunker {
40 pub fn new() -> Self {
41 Self {
42 max_chunk_chars: CHUNK_SIZE_CHARS,
43 }
44 }
45
46 pub fn with_max_chunk_chars(self, max: usize) -> Self {
47 let max = if max < MIN_CHUNK_CHARS {
48 MIN_CHUNK_CHARS
49 } else {
50 max
51 };
52 Self {
53 max_chunk_chars: max,
54 }
55 }
56
57 pub fn chunk(&self, content: &str, path: &Path) -> Result<Vec<SemanticChunk>> {
62 let language = match Language::from_path(path) {
63 Some(lang) => lang,
64 None => return self.fallback_chunk(content),
65 };
66
67 let tree = match parser::parse(content, language) {
68 Ok(tree) => tree,
69 Err(e) => {
70 debug!(
71 error = %e,
72 path = %path.display(),
73 language = %language.as_str(),
74 "AST parse failed, falling back to character-based chunking"
75 );
76 return self.fallback_chunk(content);
77 }
78 };
79
80 let strategy = LanguageStrategy::for_language(language);
81 let chunks = strategy.extract_chunks(content, tree.root_node())?;
82 let chunks = split_oversized_chunks(chunks, self.max_chunk_chars);
83
84 Ok(chunks)
85 }
86
87 fn fallback_chunk(&self, content: &str) -> Result<Vec<SemanticChunk>> {
89 let char_chunks = chunk_by_chars(content, CHUNK_SIZE_CHARS, CHUNK_OVERLAP_CHARS);
90
91 let semantic_chunks = char_chunks
92 .into_iter()
93 .map(|c| {
94 let hash = compute_chunk_hash(&c.text, "", "");
95 SemanticChunk {
96 text: c.text,
97 chunk_type: ChunkType::Text,
98 chunk_hash: hash,
99 position: c.position,
100 token_count: c.token_count,
101 metadata: ChunkMetadata::default(),
102 }
103 })
104 .collect();
105
106 Ok(semantic_chunks)
107 }
108}
109
110pub fn chunk_semantic(content: &str, path: &Path) -> Result<Vec<SemanticChunk>> {
112 SemanticChunker::new().chunk(content, path)
113}
114
115impl From<SemanticChunk> for Chunk {
117 fn from(sc: SemanticChunk) -> Self {
118 Chunk {
119 text: sc.text,
120 position: sc.position,
121 token_count: sc.token_count,
122 }
123 }
124}
125
#[cfg(test)]
mod tests {
    use super::*;

    /// A Rust file should yield separate function and struct chunks.
    #[test]
    fn test_rust_file_chunking() {
        let source = r#"
/// A greeting function
fn hello() {
    println!("Hello, world!");
}

struct Point {
    x: i32,
    y: i32,
}
"#;
        let chunks = chunk_semantic(source, Path::new("test.rs")).unwrap();

        assert!(chunks.len() >= 2);
        let has_kind = |kind: ChunkType| chunks.iter().any(|c| c.chunk_type == kind);
        assert!(has_kind(ChunkType::Function));
        assert!(has_kind(ChunkType::Struct));
    }

    /// Python sources should chunk without error.
    #[test]
    fn test_python_file_chunking() {
        let source = r#"
def greet(name):
    """Greet someone."""
    print(f"Hello, {name}!")

class Greeter:
    def __init__(self):
        pass
"#;
        let chunks = chunk_semantic(source, Path::new("test.py")).unwrap();

        assert!(!chunks.is_empty());
    }

    /// Unsupported languages (markdown) take the character-based fallback.
    #[test]
    fn test_markdown_fallback() {
        let source = "# Hello\n\nThis is markdown content.";
        let chunks = chunk_semantic(source, Path::new("test.md")).unwrap();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].chunk_type, ChunkType::Text);
    }

    /// Every chunk carries a 32-char content hash.
    #[test]
    fn test_chunk_hash_in_semantic_chunks() {
        let chunks = chunk_semantic("fn test() {}", Path::new("test.rs")).unwrap();

        assert!(chunks.iter().all(|c| c.chunk_hash.len() == 32));
    }

    /// `SemanticChunk` → `Chunk` keeps text and position.
    #[test]
    fn test_semantic_to_basic_chunk_conversion() {
        let converted: Chunk =
            SemanticChunk::new("test".to_string(), ChunkType::Function, 0).into();

        assert_eq!(converted.text, "test");
        assert_eq!(converted.position, 0);
    }

    /// The builder clamps sub-minimum limits and accepts valid ones.
    #[test]
    fn test_with_max_chunk_chars_validation() {
        assert_eq!(
            SemanticChunker::new().with_max_chunk_chars(0).max_chunk_chars,
            MIN_CHUNK_CHARS
        );
        assert_eq!(
            SemanticChunker::new().with_max_chunk_chars(500).max_chunk_chars,
            500
        );
    }
}