1use super::CodeChunk;
2use crate::indexer::ast_parser::AstParser;
3use crate::indexer::file_info::FileInfo;
4use crate::types::ChunkMetadata;
5use std::time::{SystemTime, UNIX_EPOCH};
6
/// Strategy used by `CodeChunker` to split a file's content into chunks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ChunkStrategy {
    /// Consecutive, non-overlapping chunks holding the given number of lines.
    FixedLines(usize),
    /// Windows of `size` lines where consecutive windows share `overlap` lines.
    ///
    /// When `overlap` is not smaller than `size`, the window advances one line
    /// at a time.
    SlidingWindow { size: usize, overlap: usize },
    /// One chunk per node reported by the AST parser.
    AstBased,
    /// AST-based chunking that falls back to fixed chunks of `fallback_lines`
    /// lines when the AST pass produces no chunks.
    Hybrid { fallback_lines: usize },
}
18
/// Splits the content of a file into `CodeChunk`s according to a `ChunkStrategy`.
pub struct CodeChunker {
    /// Strategy that `chunk_file` dispatches on.
    strategy: ChunkStrategy,
}
22
23impl CodeChunker {
24 pub fn new(strategy: ChunkStrategy) -> Self {
25 Self { strategy }
26 }
27
28 pub fn default_strategy() -> Self {
30 Self::new(ChunkStrategy::Hybrid { fallback_lines: 50 })
31 }
32
33 pub fn chunk_file(&self, file_info: &FileInfo) -> Vec<CodeChunk> {
35 match &self.strategy {
36 ChunkStrategy::FixedLines(lines_per_chunk) => {
37 self.chunk_fixed_lines(file_info, *lines_per_chunk)
38 }
39 ChunkStrategy::SlidingWindow { size, overlap } => {
40 self.chunk_sliding_window(file_info, *size, *overlap)
41 }
42 ChunkStrategy::AstBased => self.chunk_ast_based(file_info),
43 ChunkStrategy::Hybrid { fallback_lines } => {
44 let ast_chunks = self.chunk_ast_based(file_info);
46 if ast_chunks.is_empty() {
47 self.chunk_fixed_lines(file_info, *fallback_lines)
48 } else {
49 ast_chunks
50 }
51 }
52 }
53 }
54
55 fn chunk_fixed_lines(&self, file_info: &FileInfo, lines_per_chunk: usize) -> Vec<CodeChunk> {
57 let lines: Vec<&str> = file_info.content.lines().collect();
58 let mut chunks = Vec::new();
59
60 if lines.is_empty() {
61 return chunks;
62 }
63
64 let timestamp = SystemTime::now()
65 .duration_since(UNIX_EPOCH)
66 .unwrap()
67 .as_secs() as i64;
68
69 for (chunk_idx, chunk_lines) in lines.chunks(lines_per_chunk).enumerate() {
70 let start_line = chunk_idx * lines_per_chunk + 1;
71 let end_line = start_line + chunk_lines.len() - 1;
72 let content = chunk_lines.join("\n");
73
74 if content.trim().is_empty() {
76 continue;
77 }
78
79 let metadata = ChunkMetadata {
80 file_path: file_info.relative_path.clone(),
81 root_path: Some(file_info.root_path.clone()),
82 project: file_info.project.clone(),
83 start_line,
84 end_line,
85 language: file_info.language.clone(),
86 extension: file_info.extension.clone(),
87 file_hash: file_info.hash.clone(),
88 indexed_at: timestamp,
89 };
90
91 chunks.push(CodeChunk { content, metadata });
92 }
93
94 chunks
95 }
96
97 fn chunk_sliding_window(
99 &self,
100 file_info: &FileInfo,
101 size: usize,
102 overlap: usize,
103 ) -> Vec<CodeChunk> {
104 let lines: Vec<&str> = file_info.content.lines().collect();
105 let mut chunks = Vec::new();
106
107 if lines.is_empty() {
108 return chunks;
109 }
110
111 let timestamp = SystemTime::now()
112 .duration_since(UNIX_EPOCH)
113 .unwrap()
114 .as_secs() as i64;
115
116 let step = if overlap < size { size - overlap } else { 1 };
117 let mut start_idx = 0;
118
119 while start_idx < lines.len() {
120 let end_idx = (start_idx + size).min(lines.len());
121 let chunk_lines = &lines[start_idx..end_idx];
122 let content = chunk_lines.join("\n");
123
124 if content.trim().is_empty() {
126 start_idx += step;
127 continue;
128 }
129
130 let start_line = start_idx + 1;
131 let end_line = end_idx;
132
133 let metadata = ChunkMetadata {
134 file_path: file_info.relative_path.clone(),
135 root_path: Some(file_info.root_path.clone()),
136 project: file_info.project.clone(),
137 start_line,
138 end_line,
139 language: file_info.language.clone(),
140 extension: file_info.extension.clone(),
141 file_hash: file_info.hash.clone(),
142 indexed_at: timestamp,
143 };
144
145 chunks.push(CodeChunk { content, metadata });
146
147 if end_idx >= lines.len() {
149 break;
150 }
151
152 start_idx += step;
153 }
154
155 chunks
156 }
157
158 fn chunk_ast_based(&self, file_info: &FileInfo) -> Vec<CodeChunk> {
160 let extension = match &file_info.extension {
162 Some(ext) => ext,
163 None => {
164 tracing::debug!("No extension for AST parsing: {:?}", file_info.path);
165 return Vec::new();
166 }
167 };
168
169 let mut parser = match AstParser::new(extension) {
171 Ok(p) => p,
172 Err(_) => {
173 tracing::debug!("Unsupported language for AST parsing: {}", extension);
174 return Vec::new();
175 }
176 };
177
178 let ast_nodes = match parser.parse(&file_info.content) {
180 Ok(nodes) => nodes,
181 Err(e) => {
182 tracing::warn!("Failed to parse file {:?}: {}", file_info.path, e);
183 return Vec::new();
184 }
185 };
186
187 let timestamp = SystemTime::now()
188 .duration_since(UNIX_EPOCH)
189 .unwrap()
190 .as_secs() as i64;
191
192 let mut chunks = Vec::new();
193 let lines: Vec<&str> = file_info.content.lines().collect();
194
195 for ast_node in ast_nodes {
196 let start_idx = ast_node.start_line.saturating_sub(1);
198 let end_idx = ast_node.end_line.min(lines.len());
199
200 if start_idx >= end_idx {
201 continue;
202 }
203
204 let chunk_lines = &lines[start_idx..end_idx];
205 let content = chunk_lines.join("\n");
206
207 if content.trim().is_empty() {
209 continue;
210 }
211
212 let metadata = ChunkMetadata {
213 file_path: file_info.relative_path.clone(),
214 root_path: Some(file_info.root_path.clone()),
215 project: file_info.project.clone(),
216 start_line: ast_node.start_line,
217 end_line: ast_node.end_line,
218 language: file_info.language.clone(),
219 extension: file_info.extension.clone(),
220 file_hash: file_info.hash.clone(),
221 indexed_at: timestamp,
222 };
223
224 chunks.push(CodeChunk { content, metadata });
225 }
226
227 if chunks.is_empty() {
229 tracing::debug!("No AST chunks created for {:?}", file_info.path);
230 }
231
232 chunks
233 }
234}
235
236impl Default for CodeChunker {
237 fn default() -> Self {
238 Self::default_strategy()
239 }
240}
241
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Builds a `FileInfo` describing a Rust file named `test.rs` that holds `content`.
    fn create_test_file_info(content: &str) -> FileInfo {
        FileInfo {
            path: PathBuf::from("test.rs"),
            relative_path: String::from("test.rs"),
            root_path: String::from("/test/root"),
            project: None,
            extension: Some(String::from("rs")),
            language: Some(String::from("Rust")),
            content: String::from(content),
            hash: String::from("test_hash"),
        }
    }

    /// Produces `count` lines of the form `line N` (1-based), joined by newlines.
    fn numbered_lines(count: usize) -> String {
        let rows: Vec<String> = (1..=count).map(|n| format!("line {}", n)).collect();
        rows.join("\n")
    }

    #[test]
    fn test_fixed_lines_chunking() {
        let info = create_test_file_info(&numbered_lines(100));
        let result = CodeChunker::new(ChunkStrategy::FixedLines(10)).chunk_file(&info);

        assert_eq!(result.len(), 10);

        let first = &result[0].metadata;
        assert_eq!(first.start_line, 1);
        assert_eq!(first.end_line, 10);

        let last = &result[9].metadata;
        assert_eq!(last.start_line, 91);
        assert_eq!(last.end_line, 100);
    }

    #[test]
    fn test_sliding_window_chunking() {
        let info = create_test_file_info(&numbered_lines(20));
        let strategy = ChunkStrategy::SlidingWindow {
            size: 10,
            overlap: 5,
        };
        let result = CodeChunker::new(strategy).chunk_file(&info);

        assert!(result.len() >= 3);
        assert_eq!(result[0].metadata.start_line, 1);
    }

    #[test]
    fn test_default_strategy() {
        let built = CodeChunker::default_strategy();
        let is_hybrid = matches!(built.strategy, ChunkStrategy::Hybrid { .. });
        assert!(is_hybrid);
    }

    #[test]
    fn test_default() {
        let built = CodeChunker::default();
        let is_hybrid = matches!(built.strategy, ChunkStrategy::Hybrid { .. });
        assert!(is_hybrid);
    }

    #[test]
    fn test_empty_file() {
        let info = create_test_file_info("");
        let result = CodeChunker::new(ChunkStrategy::FixedLines(10)).chunk_file(&info);
        assert!(result.is_empty());
    }

    #[test]
    fn test_whitespace_only_file() {
        let info = create_test_file_info(" \n\t\n ");
        let result = CodeChunker::new(ChunkStrategy::FixedLines(10)).chunk_file(&info);
        assert!(result.is_empty());
    }

    #[test]
    fn test_single_line_file() {
        let info = create_test_file_info("fn main() {}");
        let result = CodeChunker::new(ChunkStrategy::FixedLines(10)).chunk_file(&info);

        assert_eq!(result.len(), 1);
        let only = &result[0].metadata;
        assert_eq!(only.start_line, 1);
        assert_eq!(only.end_line, 1);
    }

    #[test]
    fn test_sliding_window_overlap_equal_size() {
        let info = create_test_file_info(&numbered_lines(20));
        let strategy = ChunkStrategy::SlidingWindow {
            size: 10,
            overlap: 10,
        };
        let result = CodeChunker::new(strategy).chunk_file(&info);
        // An overlap equal to the size degrades to a one-line step.
        assert!(result.len() > 10);
    }

    #[test]
    fn test_sliding_window_overlap_greater_than_size() {
        let info = create_test_file_info(&numbered_lines(20));
        let strategy = ChunkStrategy::SlidingWindow {
            size: 10,
            overlap: 15,
        };
        let result = CodeChunker::new(strategy).chunk_file(&info);
        // An overlap larger than the size also degrades to a one-line step.
        assert!(result.len() > 10);
    }

    #[test]
    fn test_ast_based_rust() {
        let source = r#"
fn hello() {
    println!("Hello");
}

fn world() {
    println!("World");
}
"#;
        let info = create_test_file_info(source);
        let result = CodeChunker::new(ChunkStrategy::AstBased).chunk_file(&info);
        assert!(result.len() >= 2);
    }

    #[test]
    fn test_ast_based_no_extension() {
        let mut info = create_test_file_info("fn main() {}");
        info.extension = None;
        let result = CodeChunker::new(ChunkStrategy::AstBased).chunk_file(&info);
        assert!(result.is_empty());
    }

    #[test]
    fn test_ast_based_unsupported_language() {
        let mut info = create_test_file_info("some content");
        info.extension = Some(String::from("txt"));
        let result = CodeChunker::new(ChunkStrategy::AstBased).chunk_file(&info);
        assert!(result.is_empty());
    }

    #[test]
    fn test_hybrid_with_ast_success() {
        let source = r#"
fn hello() {
    println!("Hello");
}
"#;
        let info = create_test_file_info(source);
        let strategy = ChunkStrategy::Hybrid { fallback_lines: 50 };
        let result = CodeChunker::new(strategy).chunk_file(&info);
        assert!(result.len() > 0);
    }

    #[test]
    fn test_hybrid_fallback_to_fixed() {
        let mut info = create_test_file_info("line 1\nline 2\nline 3");
        info.extension = Some(String::from("txt"));
        let strategy = ChunkStrategy::Hybrid { fallback_lines: 2 };
        let result = CodeChunker::new(strategy).chunk_file(&info);
        assert!(result.len() > 0);
    }

    #[test]
    fn test_metadata_fields() {
        let mut info = create_test_file_info("fn main() {}");
        info.project = Some(String::from("test-project"));
        info.hash = String::from("abc123");

        let result = CodeChunker::new(ChunkStrategy::FixedLines(10)).chunk_file(&info);
        assert_eq!(result.len(), 1);

        let meta = &result[0].metadata;
        assert_eq!(meta.file_path, "test.rs");
        assert_eq!(meta.project, Some(String::from("test-project")));
        assert_eq!(meta.language, Some(String::from("Rust")));
        assert_eq!(meta.extension, Some(String::from("rs")));
        assert_eq!(meta.file_hash, "abc123");
        assert!(meta.indexed_at > 0);
    }

    #[test]
    fn test_sliding_window_empty_chunks_skipped() {
        let info = create_test_file_info("line 1\n\n\n\nline 5");
        let strategy = ChunkStrategy::SlidingWindow {
            size: 2,
            overlap: 0,
        };
        let result = CodeChunker::new(strategy).chunk_file(&info);

        assert!(result.len() > 0);
        for piece in result.iter() {
            assert!(!piece.content.trim().is_empty());
        }
    }

    #[test]
    fn test_fixed_lines_empty_chunks_skipped() {
        let info = create_test_file_info("line 1\n\n\nline 4");
        let result = CodeChunker::new(ChunkStrategy::FixedLines(2)).chunk_file(&info);

        for piece in result.iter() {
            assert!(!piece.content.trim().is_empty());
        }
    }

    #[test]
    fn test_ast_based_invalid_syntax() {
        let info = create_test_file_info("fn incomplete {");
        let result = CodeChunker::new(ChunkStrategy::AstBased).chunk_file(&info);
        assert!(result.is_empty());
    }
}