infiniloom_engine/chunking/mod.rs

//! Intelligent code chunking for LLM context windows
//!
//! This module provides various strategies for splitting repositories into
//! chunks that fit within LLM context windows while preserving semantic coherence.
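//!
//! A minimal usage sketch (illustrative only; assumes the crate exposes this
//! module as `infiniloom_engine::chunking` and that the `Repository` has
//! already been populated with files and token counts):
//!
//! ```ignore
//! use infiniloom_engine::chunking::{Chunker, ChunkStrategy};
//! use infiniloom_engine::types::{Repository, TokenizerModel};
//!
//! let repo = Repository::new("my-project", "/path/to/my-project");
//! // ... populate `repo.files` before chunking ...
//!
//! let chunker = Chunker::new(ChunkStrategy::Semantic, 8_000)
//!     .with_overlap(200)
//!     .with_model(TokenizerModel::Claude);
//! let chunks = chunker.chunk(&repo);
//! ```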

mod strategies;
mod types;

use types::SymbolSnippet;
pub use types::{Chunk, ChunkContext, ChunkFile, ChunkStrategy, Chunker, CrossReference};

use crate::tokenizer::Tokenizer;
use crate::types::{RepoFile, Repository, SymbolKind, TokenizerModel};
use std::collections::{BTreeMap, HashMap, HashSet};

impl Chunker {
    /// Create a new chunker
    pub fn new(strategy: ChunkStrategy, max_tokens: u32) -> Self {
        Self { strategy, max_tokens, overlap_tokens: 200, model: TokenizerModel::Claude }
    }

    /// Set overlap tokens
    pub fn with_overlap(mut self, tokens: u32) -> Self {
        self.overlap_tokens = tokens;
        self
    }

    /// Set target model
    pub fn with_model(mut self, model: TokenizerModel) -> Self {
        self.model = model;
        self
    }

    /// Chunk a repository
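    ///
    /// Dispatches to the strategy chosen at construction. A sketch of consuming
    /// the result (illustrative only; field names match the `Chunk` type below):
    ///
    /// ```ignore
    /// let chunks = chunker.chunk(&repo);
    /// for chunk in &chunks {
    ///     println!("[{}/{}] {} ({} tokens, {} files)",
    ///         chunk.index + 1, chunk.total, chunk.focus, chunk.tokens, chunk.files.len());
    /// }
    /// ```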
    pub fn chunk(&self, repo: &Repository) -> Vec<Chunk> {
        match self.strategy {
            ChunkStrategy::Fixed { size } => self.fixed_chunk(repo, size),
            ChunkStrategy::File => self.file_chunk(repo),
            ChunkStrategy::Module => self.module_chunk(repo),
            ChunkStrategy::Symbol => self.symbol_chunk(repo),
            ChunkStrategy::Semantic => self.semantic_chunk(repo),
            ChunkStrategy::Dependency => self.dependency_chunk(repo),
        }
    }

    // =========================================================================
    // Chunk creation helpers
    // =========================================================================

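    /// Build a chunk from owned files. Cross-chunk context (previous summary,
    /// next preview, overlap, cross-references) is left empty here and filled
    /// in later by `finalize_chunks`.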
    pub(crate) fn create_chunk(&self, index: usize, files: &[RepoFile], tokens: u32) -> Chunk {
        let focus = self.determine_focus(files);

        Chunk {
            index,
            total: 0, // Updated in finalize
            focus: focus.clone(),
            tokens,
            files: files
                .iter()
                .map(|f| ChunkFile {
                    path: f.relative_path.clone(),
                    content: f.content.clone().unwrap_or_default(),
                    tokens: f.token_count.get(self.model),
                    truncated: false,
                })
                .collect(),
            context: ChunkContext {
                previous_summary: None,
                current_focus: focus,
                next_preview: None,
                cross_references: Vec::new(),
                overlap_content: None,
            },
        }
    }

    /// Create a chunk from file references (avoids cloning RepoFile)
    pub(crate) fn create_chunk_from_refs(
        &self,
        index: usize,
        files: &[&RepoFile],
        tokens: u32,
    ) -> Chunk {
        let focus = self.determine_focus_refs(files);

        Chunk {
            index,
            total: 0, // Updated in finalize
            focus: focus.clone(),
            tokens,
            files: files
                .iter()
                .map(|f| ChunkFile {
                    path: f.relative_path.clone(),
                    content: f.content.clone().unwrap_or_default(),
                    tokens: f.token_count.get(self.model),
                    truncated: false,
                })
                .collect(),
            context: ChunkContext {
                previous_summary: None,
                current_focus: focus,
                next_preview: None,
                cross_references: Vec::new(),
                overlap_content: None,
            },
        }
    }

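    /// Assemble a chunk from symbol snippets: snippets are grouped per file,
    /// ordered by start line, joined with blank lines, and re-counted with the
    /// tokenizer for the chunker's target model.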
    pub(crate) fn build_symbol_chunk(
        &self,
        index: usize,
        snippets: &[SymbolSnippet],
        tokenizer: &Tokenizer,
    ) -> Chunk {
        let focus = self.determine_symbol_focus(snippets);
        let mut by_file: BTreeMap<&str, Vec<&SymbolSnippet>> = BTreeMap::new();

        for snippet in snippets {
            by_file
                .entry(snippet.file_path.as_str())
                .or_default()
                .push(snippet);
        }

        let mut files = Vec::new();
        let mut total_tokens = 0u32;

        for (path, mut entries) in by_file {
            entries.sort_by(|a, b| {
                a.start_line
                    .cmp(&b.start_line)
                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
            });

            let mut content = String::new();
            for entry in entries {
                if !content.is_empty() {
                    content.push_str("\n\n");
                }
                content.push_str(&entry.content);
            }

            let tokens = tokenizer.count(&content, self.model);
            total_tokens += tokens;

            files.push(ChunkFile { path: path.to_owned(), content, tokens, truncated: false });
        }

        Chunk {
            index,
            total: 0,
            focus: focus.clone(),
            tokens: total_tokens,
            files,
            context: ChunkContext {
                previous_summary: None,
                current_focus: focus,
                next_preview: None,
                cross_references: Vec::new(),
                overlap_content: None,
            },
        }
    }

    // =========================================================================
    // Focus determination
    // =========================================================================

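    /// Heuristic focus label: a shared top-level path segment yields
    /// "<dir> module", a shared language yields "<lang> files", otherwise
    /// "Mixed content".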
    fn determine_focus(&self, files: &[RepoFile]) -> String {
        if files.is_empty() {
            return "Empty".to_owned();
        }

        // Try to find common directory
        let first_path = &files[0].relative_path;
        if let Some(module) = first_path.split('/').next() {
            if files.iter().all(|f| f.relative_path.starts_with(module)) {
                return format!("{} module", module);
            }
        }

        // Try to find common language
        if let Some(lang) = &files[0].language {
            if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
                return format!("{} files", lang);
            }
        }

        "Mixed content".to_owned()
    }

    /// Determine focus for file references (avoids requiring owned slice)
    fn determine_focus_refs(&self, files: &[&RepoFile]) -> String {
        if files.is_empty() {
            return "Empty".to_owned();
        }

        // Try to find common directory
        let first_path = &files[0].relative_path;
        if let Some(module) = first_path.split('/').next() {
            if files.iter().all(|f| f.relative_path.starts_with(module)) {
                return format!("{} module", module);
            }
        }

        // Try to find common language
        if let Some(lang) = &files[0].language {
            if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
                return format!("{} files", lang);
            }
        }

        "Mixed content".to_owned()
    }

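    /// Label a symbol chunk by its first few symbol names, e.g.
    /// "Symbol: parse" or "Symbols: parse, lex, eval +2 more".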
    fn determine_symbol_focus(&self, snippets: &[SymbolSnippet]) -> String {
        if snippets.is_empty() {
            return "Symbols".to_owned();
        }

        let mut names: Vec<String> = snippets
            .iter()
            .take(3)
            .map(|snippet| snippet.symbol_name.clone())
            .collect();

        let suffix = if snippets.len() > names.len() {
            format!(" +{} more", snippets.len() - names.len())
        } else {
            String::new()
        };

        if names.len() == 1 {
            format!("Symbol: {}{}", names.remove(0), suffix)
        } else {
            format!("Symbols: {}{}", names.join(", "), suffix)
        }
    }

    // =========================================================================
    // Overlap and finalization
    // =========================================================================

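    /// Pick files to carry into the next chunk as overlap: currently only the
    /// last file, and only if it fits within `overlap_tokens`.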
    pub(crate) fn get_overlap_files(&self, files: &[RepoFile]) -> Vec<RepoFile> {
        // Keep files that might be needed for context
        // For now, just keep the last file if it's small enough
        files
            .last()
            .filter(|f| f.token_count.get(self.model) < self.overlap_tokens)
            .cloned()
            .into_iter()
            .collect()
    }

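    /// Second pass over freshly built chunks: set each chunk's total, attach a
    /// previous-chunk summary and overlap excerpt, add a next-chunk preview,
    /// and populate cross-references against the repository's symbols.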
    pub(crate) fn finalize_chunks(&self, mut chunks: Vec<Chunk>, repo: &Repository) -> Vec<Chunk> {
        let total = chunks.len();

        // First pass: collect the focus strings and overlap content we need
        let focus_strs: Vec<String> = chunks.iter().map(|c| c.focus.clone()).collect();

        // Extract overlap content from each chunk for the next one
        let overlap_contents: Vec<Option<String>> = if self.overlap_tokens > 0 {
            chunks
                .iter()
                .map(|chunk| self.extract_overlap_content(chunk))
                .collect()
        } else {
            vec![None; chunks.len()]
        };

        for (i, chunk) in chunks.iter_mut().enumerate() {
            chunk.total = total;

            // Add previous summary
            if i > 0 {
                chunk.context.previous_summary = Some(format!("Previous: {}", focus_strs[i - 1]));

                // Add overlap content from previous chunk
                if let Some(ref overlap) = overlap_contents[i - 1] {
                    chunk.context.overlap_content = Some(format!(
                        "<!-- [OVERLAP FROM PREVIOUS CHUNK] -->\n{}\n<!-- [END OVERLAP] -->",
                        overlap
                    ));
                }
            }

            // Add next preview
            if i + 1 < total {
                chunk.context.next_preview = Some(format!("Next: Chunk {}", i + 2));
            }
        }

        self.populate_cross_references(&mut chunks, repo);

        chunks
    }

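    /// Index every non-import symbol by name and location, then record for each
    /// chunk up to `MAX_REFS` references to symbols that live in other chunks,
    /// following call, `extends`, and `implements` edges.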
    fn populate_cross_references(&self, chunks: &mut [Chunk], repo: &Repository) {
        const MAX_REFS: usize = 25;

        #[derive(Clone)]
        struct SymbolLocation {
            chunk_index: usize,
            file: String,
        }

        let file_lookup: HashMap<&str, &RepoFile> = repo
            .files
            .iter()
            .map(|file| (file.relative_path.as_str(), file))
            .collect();

        let mut symbol_index: HashMap<String, Vec<SymbolLocation>> = HashMap::new();
        let mut seen_symbols: HashSet<(String, usize, String)> = HashSet::new();

        for (chunk_index, chunk) in chunks.iter().enumerate() {
            for chunk_file in &chunk.files {
                if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
                    for symbol in &repo_file.symbols {
                        if symbol.kind == SymbolKind::Import {
                            continue;
                        }
                        let key = (symbol.name.clone(), chunk_index, chunk_file.path.clone());
                        if seen_symbols.insert(key) {
                            symbol_index.entry(symbol.name.clone()).or_default().push(
                                SymbolLocation { chunk_index, file: chunk_file.path.clone() },
                            );
                        }
                    }
                }
            }
        }

        for (chunk_index, chunk) in chunks.iter_mut().enumerate() {
            let mut refs: Vec<CrossReference> = Vec::new();
            let mut seen_refs: HashSet<(String, usize, String)> = HashSet::new();

            'files: for chunk_file in &chunk.files {
                if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
                    for symbol in &repo_file.symbols {
                        for called in &symbol.calls {
                            if let Some(targets) = symbol_index.get(called) {
                                for target in targets {
                                    if target.chunk_index == chunk_index {
                                        continue;
                                    }
                                    let key = (
                                        called.to_owned(),
                                        target.chunk_index,
                                        target.file.clone(),
                                    );
                                    if seen_refs.insert(key) {
                                        refs.push(CrossReference {
                                            symbol: called.to_owned(),
                                            chunk_index: target.chunk_index,
                                            file: target.file.clone(),
                                        });
                                        if refs.len() >= MAX_REFS {
                                            break 'files;
                                        }
                                    }
                                }
                            }
                        }

                        if let Some(ref base) = symbol.extends {
                            if let Some(targets) = symbol_index.get(base) {
                                for target in targets {
                                    if target.chunk_index == chunk_index {
                                        continue;
                                    }
                                    let key =
                                        (base.to_owned(), target.chunk_index, target.file.clone());
                                    if seen_refs.insert(key) {
                                        refs.push(CrossReference {
                                            symbol: base.to_owned(),
                                            chunk_index: target.chunk_index,
                                            file: target.file.clone(),
                                        });
                                        if refs.len() >= MAX_REFS {
                                            break 'files;
                                        }
                                    }
                                }
                            }
                        }

                        for iface in &symbol.implements {
                            if let Some(targets) = symbol_index.get(iface) {
                                for target in targets {
                                    if target.chunk_index == chunk_index {
                                        continue;
                                    }
                                    let key =
                                        (iface.to_owned(), target.chunk_index, target.file.clone());
                                    if seen_refs.insert(key) {
                                        refs.push(CrossReference {
                                            symbol: iface.to_owned(),
                                            chunk_index: target.chunk_index,
                                            file: target.file.clone(),
                                        });
                                        if refs.len() >= MAX_REFS {
                                            break 'files;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }

            refs.sort_by(|a, b| {
                a.chunk_index
                    .cmp(&b.chunk_index)
                    .then_with(|| a.symbol.cmp(&b.symbol))
                    .then_with(|| a.file.cmp(&b.file))
            });
            if refs.len() > MAX_REFS {
                refs.truncate(MAX_REFS);
            }

            chunk.context.cross_references = refs;
        }
    }

    /// Extract content from the end of a chunk for overlap
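    ///
    /// Files are walked in reverse, taking whole files while they fit within
    /// `overlap_tokens` and otherwise falling back to the trailing lines of the
    /// first file that does not fit.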
    fn extract_overlap_content(&self, chunk: &Chunk) -> Option<String> {
        if self.overlap_tokens == 0 || chunk.files.is_empty() {
            return None;
        }

        let tokenizer = Tokenizer::new();
        let mut overlap_parts = Vec::new();
        let mut remaining_tokens = self.overlap_tokens;
        let token_model = self.model;

        // Take content from the last files until we've accumulated enough tokens
        for file in chunk.files.iter().rev() {
            if remaining_tokens == 0 {
                break;
            }

            let file_tokens = tokenizer.count(&file.content, token_model);
            if file_tokens <= remaining_tokens {
                // Include entire file
                overlap_parts.push(format!("// From: {}\n{}", file.path, file.content));
                remaining_tokens = remaining_tokens.saturating_sub(file_tokens);
            } else {
                // Include partial file (last N lines that fit)
                let lines: Vec<&str> = file.content.lines().collect();
                let mut partial_lines = Vec::new();
                let mut partial_tokens = 0u32;

                for line in lines.iter().rev() {
                    let line_tokens = tokenizer.count(line, token_model);
                    if partial_tokens + line_tokens > remaining_tokens {
                        break;
                    }
                    partial_lines.push(*line);
                    partial_tokens += line_tokens;
                }

                if !partial_lines.is_empty() {
                    partial_lines.reverse();
                    let partial_content = partial_lines.join("\n");
                    overlap_parts
                        .push(format!("// From: {} (partial)\n{}", file.path, partial_content));
                }
                remaining_tokens = 0;
            }
        }

        if overlap_parts.is_empty() {
            None
        } else {
            overlap_parts.reverse();
            Some(overlap_parts.join("\n\n"))
        }
    }
}

#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use crate::types::{Symbol, SymbolKind, TokenCounts, Visibility};

    fn create_test_repo() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        for i in 0..5 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/src/file{}.py", i).into(),
                relative_path: format!("src/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 1000,
                token_count: TokenCounts {
                    o200k: 480,
                    cl100k: 490,
                    claude: 500,
                    gemini: 470,
                    llama: 460,
                    mistral: 460,
                    deepseek: 460,
                    qwen: 460,
                    cohere: 465,
                    grok: 460,
                },
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# File {}\ndef func{}(): pass", i, i)),
            });
        }

        repo
    }

    fn create_multi_module_repo() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        // Module A: 3 files
        for i in 0..3 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/moduleA/file{}.py", i).into(),
                relative_path: format!("moduleA/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 500,
                token_count: TokenCounts::default_with_value(300),
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# Module A File {}\ndef funcA{}(): pass", i, i)),
            });
        }

        // Module B: 2 files
        for i in 0..2 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/moduleB/file{}.py", i).into(),
                relative_path: format!("moduleB/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 500,
                token_count: TokenCounts::default_with_value(300),
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# Module B File {}\ndef funcB{}(): pass", i, i)),
            });
        }

        repo
    }

    fn create_repo_with_imports() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        // File A imports nothing
        let mut file_a = RepoFile {
            path: "/tmp/test/src/utils.py".into(),
            relative_path: "src/utils.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 500,
            token_count: TokenCounts::default_with_value(200),
            symbols: vec![Symbol::new("helper", SymbolKind::Function)],
            importance: 0.5,
            content: Some("def helper(): pass".to_string()),
        };
        file_a.symbols[0].start_line = 1;
        file_a.symbols[0].end_line = 1;

        // File B imports from A
        let mut file_b = RepoFile {
            path: "/tmp/test/src/main.py".into(),
            relative_path: "src/main.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 500,
            token_count: TokenCounts::default_with_value(200),
            symbols: vec![
                Symbol::new("src/utils", SymbolKind::Import),
                Symbol::new("main", SymbolKind::Function),
            ],
            importance: 0.8,
            content: Some("from utils import helper\ndef main(): helper()".to_string()),
        };
        file_b.symbols[1].start_line = 2;
        file_b.symbols[1].end_line = 2;
        file_b.symbols[1].calls = vec!["helper".to_string()];

        repo.files.push(file_a);
        repo.files.push(file_b);

        repo
    }

    // ============================================
    // Basic Chunking Strategy Tests
    // ============================================

    #[test]
    fn test_fixed_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks
            .iter()
            .all(|c| c.tokens <= 1000 || c.files.len() == 1));
    }

    #[test]
    fn test_file_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::File, 8000);
        let chunks = chunker.chunk(&repo);

        assert_eq!(chunks.len(), repo.files.len());
    }

    #[test]
    fn test_semantic_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Semantic, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        // All chunks should have correct total
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_symbol_chunking() {
        let mut repo = create_test_repo();
        if let Some(file) = repo.files.get_mut(0) {
            let mut symbol = Symbol::new("func0", SymbolKind::Function);
            symbol.start_line = 1;
            symbol.end_line = 1;
            symbol.visibility = Visibility::Public;
            file.symbols.push(symbol);
        }

        let chunker = Chunker::new(ChunkStrategy::Symbol, 500);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    // ============================================
    // Module Chunking Tests
    // ============================================

    #[test]
    fn test_module_chunking() {
        let repo = create_multi_module_repo();
        let chunker = Chunker::new(ChunkStrategy::Module, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        // Should group by module
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_module_chunking_respects_max_tokens() {
        let repo = create_multi_module_repo();
        // Very small max_tokens to force splitting within modules
        let chunker = Chunker::new(ChunkStrategy::Module, 400);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        // Each chunk should respect the token limit (or have single file)
        for chunk in &chunks {
            assert!(chunk.tokens <= 400 || chunk.files.len() == 1);
        }
    }

    #[test]
    fn test_module_chunking_large_limit() {
        let repo = create_multi_module_repo();
        // Large limit - each module should fit in one chunk
        let chunker = Chunker::new(ChunkStrategy::Module, 10000);
        let chunks = chunker.chunk(&repo);

        // Should have 2 chunks (one per module)
        assert_eq!(chunks.len(), 2);
    }

    // ============================================
    // Dependency Chunking Tests
    // ============================================

    #[test]
    fn test_dependency_chunking() {
        let repo = create_repo_with_imports();
        let chunker = Chunker::new(ChunkStrategy::Dependency, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_dependency_chunking_order() {
        let repo = create_repo_with_imports();
        let chunker = Chunker::new(ChunkStrategy::Dependency, 1000);
        let chunks = chunker.chunk(&repo);

        // Dependencies should appear before dependents
        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_dependency_chunking_with_cycles() {
        let mut repo = Repository::new("test", "/tmp/test");

        // Create circular dependency
        let mut file_a = RepoFile {
            path: "/tmp/test/a.py".into(),
            relative_path: "a.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 500,
            token_count: TokenCounts::default_with_value(200),
            symbols: vec![
                Symbol::new("b", SymbolKind::Import),
                Symbol::new("funcA", SymbolKind::Function),
            ],
            importance: 0.5,
            content: Some("from b import funcB\ndef funcA(): funcB()".to_string()),
        };
        file_a.symbols[1].calls = vec!["funcB".to_string()];

        let mut file_b = RepoFile {
            path: "/tmp/test/b.py".into(),
            relative_path: "b.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 500,
            token_count: TokenCounts::default_with_value(200),
            symbols: vec![
                Symbol::new("a", SymbolKind::Import),
                Symbol::new("funcB", SymbolKind::Function),
            ],
            importance: 0.5,
            content: Some("from a import funcA\ndef funcB(): funcA()".to_string()),
        };
        file_b.symbols[1].calls = vec!["funcA".to_string()];

        repo.files.push(file_a);
        repo.files.push(file_b);

        let chunker = Chunker::new(ChunkStrategy::Dependency, 1000);
        let chunks = chunker.chunk(&repo);

        // Should handle cycles gracefully
        assert!(!chunks.is_empty());
        // All files should be included
        let total_files: usize = chunks.iter().map(|c| c.files.len()).sum();
        assert_eq!(total_files, 2);
    }

    // ============================================
    // Symbol Chunking Edge Cases
    // ============================================

    #[test]
    fn test_symbol_chunking_no_symbols() {
        let repo = create_test_repo(); // No symbols
        let chunker = Chunker::new(ChunkStrategy::Symbol, 500);
        let chunks = chunker.chunk(&repo);

        // Should fall back to semantic chunking
        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_symbol_chunking_with_imports() {
        let mut repo = create_test_repo();
        // Add imports (should be skipped)
        if let Some(file) = repo.files.get_mut(0) {
            file.symbols.push(Symbol::new("os", SymbolKind::Import));
            file.symbols.push(Symbol::new("sys", SymbolKind::Import));
            let mut func = Symbol::new("func0", SymbolKind::Function);
            func.start_line = 3;
            func.end_line = 5;
            file.symbols.push(func);
        }

        let chunker = Chunker::new(ChunkStrategy::Symbol, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_symbol_chunking_multiple_symbols_per_file() {
        let mut repo = Repository::new("test", "/tmp/test");
        let mut file = RepoFile {
            path: "/tmp/test/main.py".into(),
            relative_path: "main.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 1000,
            token_count: TokenCounts::default_with_value(500),
            symbols: Vec::new(),
            importance: 0.8,
            content: Some("def func1(): pass\ndef func2(): pass\ndef func3(): pass".to_string()),
        };

        for i in 1..=3 {
            let mut sym = Symbol::new(&format!("func{}", i), SymbolKind::Function);
            sym.start_line = i;
            sym.end_line = i;
            sym.importance = 0.9 - (i as f32 * 0.1);
            file.symbols.push(sym);
        }
        repo.files.push(file);

        let chunker = Chunker::new(ChunkStrategy::Symbol, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    // ============================================
    // Chunker Builder Tests
    // ============================================

    #[test]
    fn test_chunker_with_overlap() {
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000).with_overlap(500);
        assert_eq!(chunker.overlap_tokens, 500);
    }

    #[test]
    fn test_chunker_with_model() {
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000)
            .with_model(TokenizerModel::Gpt4o);
        assert_eq!(chunker.model, TokenizerModel::Gpt4o);
    }

    #[test]
    fn test_chunker_builder_chain() {
        let chunker = Chunker::new(ChunkStrategy::Semantic, 2000)
            .with_overlap(300)
            .with_model(TokenizerModel::Gemini);

        assert_eq!(chunker.overlap_tokens, 300);
        assert_eq!(chunker.model, TokenizerModel::Gemini);
        assert!(matches!(chunker.strategy, ChunkStrategy::Semantic));
    }

    // ============================================
    // Focus Determination Tests
    // ============================================

    #[test]
    fn test_determine_focus_empty() {
        let chunker = Chunker::new(ChunkStrategy::File, 1000);
        let files: Vec<RepoFile> = vec![];
        let focus = chunker.determine_focus(&files);
        assert_eq!(focus, "Empty");
    }

    #[test]
    fn test_determine_focus_common_module() {
        let repo = create_multi_module_repo();
        let chunker = Chunker::new(ChunkStrategy::File, 1000);
        // Get just moduleA files
        let module_a_files: Vec<RepoFile> = repo
            .files
            .iter()
            .filter(|f| f.relative_path.starts_with("moduleA"))
            .cloned()
            .collect();

        let focus = chunker.determine_focus(&module_a_files);
        assert!(focus.contains("moduleA"));
    }

    #[test]
    fn test_determine_focus_common_language() {
        let mut repo = Repository::new("test", "/tmp/test");
        for i in 0..3 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/dir{}/file.rs", i).into(),
                relative_path: format!("dir{}/file.rs", i),
                language: Some("rust".to_string()),
                size_bytes: 500,
                token_count: TokenCounts::default_with_value(200),
                symbols: Vec::new(),
                importance: 0.5,
                content: Some("fn main() {}".to_string()),
            });
        }

        let chunker = Chunker::new(ChunkStrategy::File, 1000);
        let focus = chunker.determine_focus(&repo.files);
        assert!(focus.contains("rust") || focus.contains("Mixed"));
    }

    // ============================================
    // Chunk Context Tests
    // ============================================

    #[test]
    fn test_chunk_context_previous_summary() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
        let chunks = chunker.chunk(&repo);

        if chunks.len() > 1 {
            // First chunk has no previous
            assert!(chunks[0].context.previous_summary.is_none());
            // Second chunk should have previous
            assert!(chunks[1].context.previous_summary.is_some());
        }
    }

    #[test]
    fn test_chunk_context_next_preview() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
        let chunks = chunker.chunk(&repo);

        if chunks.len() > 1 {
            // First chunk should have next preview
            assert!(chunks[0].context.next_preview.is_some());
            // Last chunk has no next
            assert!(chunks.last().unwrap().context.next_preview.is_none());
        }
    }

    // ============================================
    // Overlap Content Tests
    // ============================================

    #[test]
    fn test_extract_overlap_content() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600).with_overlap(100);
        let chunks = chunker.chunk(&repo);

        // If there are multiple chunks, later ones should have overlap
        if chunks.len() > 1 {
            // Overlap is added during finalization
            // Just verify chunking completes without error
            assert!(chunks.iter().all(|c| c.total == chunks.len()));
        }
    }

    #[test]
    fn test_no_overlap_when_zero() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600).with_overlap(0);
        let chunks = chunker.chunk(&repo);

        // No overlap content should be added
        for chunk in &chunks {
            assert!(chunk.context.overlap_content.is_none());
        }
    }

    // ============================================
    // Cross Reference Tests
    // ============================================

    #[test]
    fn test_cross_references_populated() {
        let repo = create_repo_with_imports();
        let chunker = Chunker::new(ChunkStrategy::File, 1000);
        let chunks = chunker.chunk(&repo);

        // Cross references should be populated during finalization
        assert!(!chunks.is_empty());
    }

    // ============================================
    // Empty Repository Tests
    // ============================================

    #[test]
    fn test_fixed_chunking_empty_repo() {
        let repo = Repository::new("empty", "/tmp/empty");
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000);
        let chunks = chunker.chunk(&repo);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_module_chunking_empty_repo() {
        let repo = Repository::new("empty", "/tmp/empty");
        let chunker = Chunker::new(ChunkStrategy::Module, 1000);
        let chunks = chunker.chunk(&repo);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_dependency_chunking_empty_repo() {
        let repo = Repository::new("empty", "/tmp/empty");
        let chunker = Chunker::new(ChunkStrategy::Dependency, 1000);
        let chunks = chunker.chunk(&repo);
        assert!(chunks.is_empty());
    }

    // ============================================
    // Large File Tests
    // ============================================

    #[test]
    fn test_fixed_chunking_single_large_file() {
        let mut repo = Repository::new("test", "/tmp/test");
        repo.files.push(RepoFile {
            path: "/tmp/test/large.py".into(),
            relative_path: "large.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 50000,
            token_count: TokenCounts::default_with_value(10000),
            symbols: Vec::new(),
            importance: 0.5,
            content: Some("x = 1\n".repeat(1000)),
        });

        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 500 }, 500);
        let chunks = chunker.chunk(&repo);

        // Large file should be in its own chunk
        assert!(!chunks.is_empty());
    }

    // ============================================
    // Chunk Total Count Tests
    // ============================================

    #[test]
    fn test_chunk_total_is_correct() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
        let chunks = chunker.chunk(&repo);

        let expected_total = chunks.len();
        for chunk in &chunks {
            assert_eq!(chunk.total, expected_total);
        }
    }

    #[test]
    fn test_chunk_index_is_sequential() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
        let chunks = chunker.chunk(&repo);

        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i);
        }
    }
}
1055}