project_rag/indexer/
chunker.rs

1use super::CodeChunk;
2use crate::indexer::ast_parser::AstParser;
3use crate::indexer::file_info::FileInfo;
4use crate::types::ChunkMetadata;
5use std::time::{SystemTime, UNIX_EPOCH};
6
/// Strategy used to split a file's content into chunks.
///
/// All sizes are expressed in lines of source text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ChunkStrategy {
    /// Consecutive, non-overlapping chunks holding the given number of lines;
    /// the last chunk may be shorter.
    FixedLines(usize),
    /// Windows of `size` lines where consecutive windows share `overlap` lines.
    ///
    /// The window advances by `size - overlap` lines, or by a single line when
    /// `overlap >= size`.
    SlidingWindow { size: usize, overlap: usize },
    /// AST-based chunking (functions, classes, methods).
    AstBased,
    /// AST-based chunking that falls back to fixed chunks of `fallback_lines`
    /// lines whenever the AST pass produces no chunks.
    Hybrid { fallback_lines: usize },
}
18
19pub struct CodeChunker {
20    strategy: ChunkStrategy,
21}
22
23impl CodeChunker {
24    pub fn new(strategy: ChunkStrategy) -> Self {
25        Self { strategy }
26    }
27
28    /// Create a chunker with default strategy (Hybrid AST with 50 line fallback)
29    pub fn default_strategy() -> Self {
30        Self::new(ChunkStrategy::Hybrid { fallback_lines: 50 })
31    }
32
33    /// Chunk a file into multiple code chunks
34    pub fn chunk_file(&self, file_info: &FileInfo) -> Vec<CodeChunk> {
35        match &self.strategy {
36            ChunkStrategy::FixedLines(lines_per_chunk) => {
37                self.chunk_fixed_lines(file_info, *lines_per_chunk)
38            }
39            ChunkStrategy::SlidingWindow { size, overlap } => {
40                self.chunk_sliding_window(file_info, *size, *overlap)
41            }
42            ChunkStrategy::AstBased => self.chunk_ast_based(file_info),
43            ChunkStrategy::Hybrid { fallback_lines } => {
44                // Try AST-based first, fallback to fixed lines if it fails
45                let ast_chunks = self.chunk_ast_based(file_info);
46                if ast_chunks.is_empty() {
47                    self.chunk_fixed_lines(file_info, *fallback_lines)
48                } else {
49                    ast_chunks
50                }
51            }
52        }
53    }
54
55    /// Chunk using fixed number of lines
56    fn chunk_fixed_lines(&self, file_info: &FileInfo, lines_per_chunk: usize) -> Vec<CodeChunk> {
57        let lines: Vec<&str> = file_info.content.lines().collect();
58        let mut chunks = Vec::new();
59
60        if lines.is_empty() {
61            return chunks;
62        }
63
64        let timestamp = SystemTime::now()
65            .duration_since(UNIX_EPOCH)
66            .unwrap()
67            .as_secs() as i64;
68
69        for (chunk_idx, chunk_lines) in lines.chunks(lines_per_chunk).enumerate() {
70            let start_line = chunk_idx * lines_per_chunk + 1;
71            let end_line = start_line + chunk_lines.len() - 1;
72            let content = chunk_lines.join("\n");
73
74            // Skip empty chunks
75            if content.trim().is_empty() {
76                continue;
77            }
78
79            let metadata = ChunkMetadata {
80                file_path: file_info.relative_path.clone(),
81                root_path: Some(file_info.root_path.clone()),
82                project: file_info.project.clone(),
83                start_line,
84                end_line,
85                language: file_info.language.clone(),
86                extension: file_info.extension.clone(),
87                file_hash: file_info.hash.clone(),
88                indexed_at: timestamp,
89            };
90
91            chunks.push(CodeChunk { content, metadata });
92        }
93
94        chunks
95    }
96
97    /// Chunk using sliding window with overlap
98    fn chunk_sliding_window(
99        &self,
100        file_info: &FileInfo,
101        size: usize,
102        overlap: usize,
103    ) -> Vec<CodeChunk> {
104        let lines: Vec<&str> = file_info.content.lines().collect();
105        let mut chunks = Vec::new();
106
107        if lines.is_empty() {
108            return chunks;
109        }
110
111        let timestamp = SystemTime::now()
112            .duration_since(UNIX_EPOCH)
113            .unwrap()
114            .as_secs() as i64;
115
116        let step = if overlap < size { size - overlap } else { 1 };
117        let mut start_idx = 0;
118
119        while start_idx < lines.len() {
120            let end_idx = (start_idx + size).min(lines.len());
121            let chunk_lines = &lines[start_idx..end_idx];
122            let content = chunk_lines.join("\n");
123
124            // Skip empty chunks
125            if content.trim().is_empty() {
126                start_idx += step;
127                continue;
128            }
129
130            let start_line = start_idx + 1;
131            let end_line = end_idx;
132
133            let metadata = ChunkMetadata {
134                file_path: file_info.relative_path.clone(),
135                root_path: Some(file_info.root_path.clone()),
136                project: file_info.project.clone(),
137                start_line,
138                end_line,
139                language: file_info.language.clone(),
140                extension: file_info.extension.clone(),
141                file_hash: file_info.hash.clone(),
142                indexed_at: timestamp,
143            };
144
145            chunks.push(CodeChunk { content, metadata });
146
147            // Break if we've reached the end
148            if end_idx >= lines.len() {
149                break;
150            }
151
152            start_idx += step;
153        }
154
155        chunks
156    }
157
158    /// Chunk using AST-based parsing (functions, classes, methods)
159    fn chunk_ast_based(&self, file_info: &FileInfo) -> Vec<CodeChunk> {
160        // Check if we have an extension and can parse it
161        let extension = match &file_info.extension {
162            Some(ext) => ext,
163            None => {
164                tracing::debug!("No extension for AST parsing: {:?}", file_info.path);
165                return Vec::new();
166            }
167        };
168
169        // Try to create parser for this language
170        let mut parser = match AstParser::new(extension) {
171            Ok(p) => p,
172            Err(_) => {
173                tracing::debug!("Unsupported language for AST parsing: {}", extension);
174                return Vec::new();
175            }
176        };
177
178        // Parse the file
179        let ast_nodes = match parser.parse(&file_info.content) {
180            Ok(nodes) => nodes,
181            Err(e) => {
182                tracing::warn!("Failed to parse file {:?}: {}", file_info.path, e);
183                return Vec::new();
184            }
185        };
186
187        let timestamp = SystemTime::now()
188            .duration_since(UNIX_EPOCH)
189            .unwrap()
190            .as_secs() as i64;
191
192        let mut chunks = Vec::new();
193        let lines: Vec<&str> = file_info.content.lines().collect();
194
195        for ast_node in ast_nodes {
196            // Extract the content for this node
197            let start_idx = ast_node.start_line.saturating_sub(1);
198            let end_idx = ast_node.end_line.min(lines.len());
199
200            if start_idx >= end_idx {
201                continue;
202            }
203
204            let chunk_lines = &lines[start_idx..end_idx];
205            let content = chunk_lines.join("\n");
206
207            // Skip empty chunks
208            if content.trim().is_empty() {
209                continue;
210            }
211
212            let metadata = ChunkMetadata {
213                file_path: file_info.relative_path.clone(),
214                root_path: Some(file_info.root_path.clone()),
215                project: file_info.project.clone(),
216                start_line: ast_node.start_line,
217                end_line: ast_node.end_line,
218                language: file_info.language.clone(),
219                extension: file_info.extension.clone(),
220                file_hash: file_info.hash.clone(),
221                indexed_at: timestamp,
222            };
223
224            chunks.push(CodeChunk { content, metadata });
225        }
226
227        // If no chunks were created, log it
228        if chunks.is_empty() {
229            tracing::debug!("No AST chunks created for {:?}", file_info.path);
230        }
231
232        chunks
233    }
234}
235
236impl Default for CodeChunker {
237    fn default() -> Self {
238        Self::default_strategy()
239    }
240}
241
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// Builds an in-memory `test.rs` fixture whose content is `content`.
    fn create_test_file_info(content: &str) -> FileInfo {
        FileInfo {
            path: PathBuf::from("test.rs"),
            relative_path: "test.rs".to_string(),
            root_path: "/test/root".to_string(),
            project: None,
            extension: Some("rs".to_string()),
            language: Some("Rust".to_string()),
            content: content.to_string(),
            hash: "test_hash".to_string(),
        }
    }

    /// Produces `count` lines of the form `line N`, joined with newlines.
    fn numbered_lines(count: usize) -> String {
        let rows: Vec<String> = (1..=count).map(|n| format!("line {}", n)).collect();
        rows.join("\n")
    }

    /// Chunks `file_info` with a fresh chunker built from `strategy`.
    fn run(strategy: ChunkStrategy, file_info: &FileInfo) -> Vec<CodeChunk> {
        CodeChunker::new(strategy).chunk_file(file_info)
    }

    #[test]
    fn test_fixed_lines_chunking() {
        let file_info = create_test_file_info(&numbered_lines(100));
        let chunks = run(ChunkStrategy::FixedLines(10), &file_info);

        assert_eq!(chunks.len(), 10);

        let first = &chunks[0].metadata;
        assert_eq!(first.start_line, 1);
        assert_eq!(first.end_line, 10);

        let last = &chunks[9].metadata;
        assert_eq!(last.start_line, 91);
        assert_eq!(last.end_line, 100);
    }

    #[test]
    fn test_sliding_window_chunking() {
        let file_info = create_test_file_info(&numbered_lines(20));
        let strategy = ChunkStrategy::SlidingWindow {
            size: 10,
            overlap: 5,
        };
        let chunks = run(strategy, &file_info);

        // size=10 with overlap=5 advances 5 lines at a time:
        // [1-10], [6-15], [11-20]
        assert!(chunks.len() >= 3);
        assert_eq!(chunks[0].metadata.start_line, 1);
    }

    #[test]
    fn test_default_strategy() {
        let CodeChunker { strategy } = CodeChunker::default_strategy();
        assert!(matches!(strategy, ChunkStrategy::Hybrid { .. }));
    }

    #[test]
    fn test_default() {
        let CodeChunker { strategy } = CodeChunker::default();
        assert!(matches!(strategy, ChunkStrategy::Hybrid { .. }));
    }

    #[test]
    fn test_empty_file() {
        let file_info = create_test_file_info("");
        let chunks = run(ChunkStrategy::FixedLines(10), &file_info);
        assert_eq!(chunks.len(), 0);
    }

    #[test]
    fn test_whitespace_only_file() {
        let file_info = create_test_file_info("   \n\t\n   ");
        let chunks = run(ChunkStrategy::FixedLines(10), &file_info);
        assert_eq!(chunks.len(), 0);
    }

    #[test]
    fn test_single_line_file() {
        let file_info = create_test_file_info("fn main() {}");
        let chunks = run(ChunkStrategy::FixedLines(10), &file_info);

        assert_eq!(chunks.len(), 1);
        let only = &chunks[0].metadata;
        assert_eq!(only.start_line, 1);
        assert_eq!(only.end_line, 1);
    }

    #[test]
    fn test_sliding_window_overlap_equal_size() {
        let file_info = create_test_file_info(&numbered_lines(20));
        let strategy = ChunkStrategy::SlidingWindow {
            size: 10,
            overlap: 10,
        };
        let chunks = run(strategy, &file_info);

        // An overlap equal to the window size degrades to a one-line step.
        assert!(chunks.len() > 10);
    }

    #[test]
    fn test_sliding_window_overlap_greater_than_size() {
        let file_info = create_test_file_info(&numbered_lines(20));
        let strategy = ChunkStrategy::SlidingWindow {
            size: 10,
            overlap: 15,
        };
        let chunks = run(strategy, &file_info);

        // An overlap larger than the window size also degrades to a one-line step.
        assert!(chunks.len() > 10);
    }

    #[test]
    fn test_ast_based_rust() {
        let content = r#"
fn hello() {
    println!("Hello");
}

fn world() {
    println!("World");
}
"#;
        let file_info = create_test_file_info(content);
        let chunks = run(ChunkStrategy::AstBased, &file_info);

        // Both functions should come back as chunks.
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_ast_based_no_extension() {
        let file_info = FileInfo {
            extension: None,
            ..create_test_file_info("fn main() {}")
        };
        let chunks = run(ChunkStrategy::AstBased, &file_info);
        assert_eq!(chunks.len(), 0);
    }

    #[test]
    fn test_ast_based_unsupported_language() {
        let file_info = FileInfo {
            extension: Some("txt".to_string()),
            ..create_test_file_info("some content")
        };
        let chunks = run(ChunkStrategy::AstBased, &file_info);
        assert_eq!(chunks.len(), 0);
    }

    #[test]
    fn test_hybrid_with_ast_success() {
        let content = r#"
fn hello() {
    println!("Hello");
}
"#;
        let file_info = create_test_file_info(content);
        let chunks = run(ChunkStrategy::Hybrid { fallback_lines: 50 }, &file_info);

        // The AST pass is expected to produce the chunks here.
        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_hybrid_fallback_to_fixed() {
        let file_info = FileInfo {
            extension: Some("txt".to_string()),
            ..create_test_file_info("line 1\nline 2\nline 3")
        };
        let chunks = run(ChunkStrategy::Hybrid { fallback_lines: 2 }, &file_info);

        // `.txt` has no AST support, so the fixed-line fallback must kick in.
        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_metadata_fields() {
        let file_info = FileInfo {
            project: Some("test-project".to_string()),
            hash: "abc123".to_string(),
            ..create_test_file_info("fn main() {}")
        };
        let chunks = run(ChunkStrategy::FixedLines(10), &file_info);

        assert_eq!(chunks.len(), 1);
        let metadata = &chunks[0].metadata;
        assert_eq!(metadata.file_path, "test.rs");
        assert_eq!(metadata.project, Some("test-project".to_string()));
        assert_eq!(metadata.language, Some("Rust".to_string()));
        assert_eq!(metadata.extension, Some("rs".to_string()));
        assert_eq!(metadata.file_hash, "abc123");
        assert!(metadata.indexed_at > 0);
    }

    #[test]
    fn test_sliding_window_empty_chunks_skipped() {
        let file_info = create_test_file_info("line 1\n\n\n\nline 5");
        let strategy = ChunkStrategy::SlidingWindow {
            size: 2,
            overlap: 0,
        };
        let chunks = run(strategy, &file_info);

        // Whitespace-only windows must not be emitted.
        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|chunk| !chunk.content.trim().is_empty()));
    }

    #[test]
    fn test_fixed_lines_empty_chunks_skipped() {
        let file_info = create_test_file_info("line 1\n\n\nline 4");
        let chunks = run(ChunkStrategy::FixedLines(2), &file_info);

        // Whatever is emitted must contain non-whitespace text.
        assert!(chunks.iter().all(|chunk| !chunk.content.trim().is_empty()));
    }

    #[test]
    fn test_ast_based_invalid_syntax() {
        let file_info = create_test_file_info("fn incomplete {"); // Invalid Rust
        let chunks = run(ChunkStrategy::AstBased, &file_info);

        // Parse errors are expected to be handled gracefully.
        assert_eq!(chunks.len(), 0);
    }
}