infiniloom_engine/chunking/
mod.rs

1//! Intelligent code chunking for LLM context windows
2//!
3//! This module provides various strategies for splitting repositories into
4//! chunks that fit within LLM context windows while preserving semantic coherence.
5
6mod strategies;
7mod types;
8
9use types::SymbolSnippet;
10pub use types::{Chunk, ChunkContext, ChunkFile, ChunkStrategy, Chunker, CrossReference};
11
12use crate::tokenizer::Tokenizer;
13use crate::types::{RepoFile, Repository, SymbolKind, TokenizerModel};
14use std::collections::{BTreeMap, HashMap, HashSet};
15
16/// Determine focus description from an iterator of RepoFile references
17fn determine_focus_impl<'a>(mut files: impl Iterator<Item = &'a RepoFile>) -> String {
18    let first = match files.next() {
19        Some(f) => f,
20        None => return "Empty".to_owned(),
21    };
22
23    // Collect remaining for iteration (we've consumed first)
24    let rest: Vec<&RepoFile> = files.collect();
25
26    // Try to find common directory
27    if let Some(module) = first.relative_path.split('/').next() {
28        if rest.iter().all(|f| f.relative_path.starts_with(module)) {
29            return format!("{} module", module);
30        }
31    }
32
33    // Try to find common language
34    if let Some(lang) = &first.language {
35        if rest.iter().all(|f| f.language.as_ref() == Some(lang)) {
36            return format!("{} files", lang);
37        }
38    }
39
40    "Mixed content".to_owned()
41}
42
43impl Chunker {
44    /// Create a new chunker
45    pub fn new(strategy: ChunkStrategy, max_tokens: u32) -> Self {
46        Self { strategy, max_tokens, overlap_tokens: 200, model: TokenizerModel::Claude }
47    }
48
49    /// Set overlap tokens
50    pub fn with_overlap(mut self, tokens: u32) -> Self {
51        self.overlap_tokens = tokens;
52        self
53    }
54
55    /// Set target model
56    pub fn with_model(mut self, model: TokenizerModel) -> Self {
57        self.model = model;
58        self
59    }
60
61    /// Chunk a repository
62    pub fn chunk(&self, repo: &Repository) -> Vec<Chunk> {
63        match self.strategy {
64            ChunkStrategy::Fixed { size } => self.fixed_chunk(repo, size),
65            ChunkStrategy::File => self.file_chunk(repo),
66            ChunkStrategy::Module => self.module_chunk(repo),
67            ChunkStrategy::Symbol => self.symbol_chunk(repo),
68            ChunkStrategy::Semantic => self.semantic_chunk(repo),
69            ChunkStrategy::Dependency => self.dependency_chunk(repo),
70        }
71    }
72
73    // =========================================================================
74    // Chunk creation helpers
75    // =========================================================================
76
77    pub(crate) fn create_chunk(&self, index: usize, files: &[RepoFile], tokens: u32) -> Chunk {
78        let focus = self.determine_focus(files);
79
80        Chunk {
81            index,
82            total: 0, // Updated in finalize
83            focus: focus.clone(),
84            tokens,
85            files: files
86                .iter()
87                .map(|f| ChunkFile {
88                    path: f.relative_path.clone(),
89                    content: f.content.clone().unwrap_or_default(),
90                    tokens: f.token_count.get(self.model),
91                    truncated: false,
92                })
93                .collect(),
94            context: ChunkContext {
95                previous_summary: None,
96                current_focus: focus,
97                next_preview: None,
98                cross_references: Vec::new(),
99                overlap_content: None,
100            },
101        }
102    }
103
104    /// Create a chunk from file references (avoids cloning RepoFile)
105    pub(crate) fn create_chunk_from_refs(
106        &self,
107        index: usize,
108        files: &[&RepoFile],
109        tokens: u32,
110    ) -> Chunk {
111        let focus = self.determine_focus_refs(files);
112
113        Chunk {
114            index,
115            total: 0, // Updated in finalize
116            focus: focus.clone(),
117            tokens,
118            files: files
119                .iter()
120                .map(|f| ChunkFile {
121                    path: f.relative_path.clone(),
122                    content: f.content.clone().unwrap_or_default(),
123                    tokens: f.token_count.get(self.model),
124                    truncated: false,
125                })
126                .collect(),
127            context: ChunkContext {
128                previous_summary: None,
129                current_focus: focus,
130                next_preview: None,
131                cross_references: Vec::new(),
132                overlap_content: None,
133            },
134        }
135    }
136
137    pub(crate) fn build_symbol_chunk(
138        &self,
139        index: usize,
140        snippets: &[SymbolSnippet],
141        tokenizer: &Tokenizer,
142    ) -> Chunk {
143        let focus = self.determine_symbol_focus(snippets);
144        let mut by_file: BTreeMap<&str, Vec<&SymbolSnippet>> = BTreeMap::new();
145
146        for snippet in snippets {
147            by_file
148                .entry(snippet.file_path.as_str())
149                .or_default()
150                .push(snippet);
151        }
152
153        let mut files = Vec::new();
154        let mut total_tokens = 0u32;
155
156        for (path, mut entries) in by_file {
157            entries.sort_by(|a, b| {
158                a.start_line
159                    .cmp(&b.start_line)
160                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
161            });
162
163            let mut content = String::new();
164            for entry in entries {
165                if !content.is_empty() {
166                    content.push_str("\n\n");
167                }
168                content.push_str(&entry.content);
169            }
170
171            let tokens = tokenizer.count(&content, self.model);
172            total_tokens += tokens;
173
174            files.push(ChunkFile { path: path.to_owned(), content, tokens, truncated: false });
175        }
176
177        Chunk {
178            index,
179            total: 0,
180            focus: focus.clone(),
181            tokens: total_tokens,
182            files,
183            context: ChunkContext {
184                previous_summary: None,
185                current_focus: focus,
186                next_preview: None,
187                cross_references: Vec::new(),
188                overlap_content: None,
189            },
190        }
191    }
192
193    // =========================================================================
194    // Focus determination
195    // =========================================================================
196
    /// Determine a focus description for a slice of owned files.
    fn determine_focus(&self, files: &[RepoFile]) -> String {
        determine_focus_impl(files.iter())
    }
200
    /// Determine focus for file references (avoids requiring an owned slice).
    /// Same semantics as `determine_focus`; both funnel into
    /// `determine_focus_impl`.
    fn determine_focus_refs(&self, files: &[&RepoFile]) -> String {
        determine_focus_impl(files.iter().copied())
    }
205
206    fn determine_symbol_focus(&self, snippets: &[SymbolSnippet]) -> String {
207        if snippets.is_empty() {
208            return "Symbols".to_owned();
209        }
210
211        let mut names: Vec<String> = snippets
212            .iter()
213            .take(3)
214            .map(|snippet| snippet.symbol_name.clone())
215            .collect();
216
217        let suffix = if snippets.len() > names.len() {
218            format!(" +{} more", snippets.len() - names.len())
219        } else {
220            String::new()
221        };
222
223        if names.len() == 1 {
224            format!("Symbol: {}{}", names.remove(0), suffix)
225        } else {
226            format!("Symbols: {}{}", names.join(", "), suffix)
227        }
228    }
229
230    // =========================================================================
231    // Overlap and finalization
232    // =========================================================================
233
234    pub(crate) fn get_overlap_files(&self, files: &[RepoFile]) -> Vec<RepoFile> {
235        // Keep files that might be needed for context
236        // For now, just keep the last file if it's small enough
237        files
238            .last()
239            .filter(|f| f.token_count.get(self.model) < self.overlap_tokens)
240            .cloned()
241            .into_iter()
242            .collect()
243    }
244
245    pub(crate) fn finalize_chunks(&self, mut chunks: Vec<Chunk>, repo: &Repository) -> Vec<Chunk> {
246        let total = chunks.len();
247
248        // First pass: collect the focus strings and overlap content we need
249        let focus_strs: Vec<String> = chunks.iter().map(|c| c.focus.clone()).collect();
250
251        // Extract overlap content from each chunk for the next one
252        let overlap_contents: Vec<Option<String>> = if self.overlap_tokens > 0 {
253            chunks
254                .iter()
255                .map(|chunk| self.extract_overlap_content(chunk))
256                .collect()
257        } else {
258            vec![None; chunks.len()]
259        };
260
261        for (i, chunk) in chunks.iter_mut().enumerate() {
262            chunk.total = total;
263
264            // Add previous summary
265            if i > 0 {
266                chunk.context.previous_summary = Some(format!("Previous: {}", focus_strs[i - 1]));
267
268                // Add overlap content from previous chunk
269                if let Some(ref overlap) = overlap_contents[i - 1] {
270                    chunk.context.overlap_content = Some(format!(
271                        "<!-- [OVERLAP FROM PREVIOUS CHUNK] -->\n{}\n<!-- [END OVERLAP] -->",
272                        overlap
273                    ));
274                }
275            }
276
277            // Add next preview
278            if i + 1 < total {
279                chunk.context.next_preview = Some(format!("Next: Chunk {}", i + 2));
280            }
281        }
282
283        self.populate_cross_references(&mut chunks, repo);
284
285        chunks
286    }
287
288    fn populate_cross_references(&self, chunks: &mut [Chunk], repo: &Repository) {
289        const MAX_REFS: usize = 25;
290
291        #[derive(Clone)]
292        struct SymbolLocation {
293            chunk_index: usize,
294            file: String,
295        }
296
297        let file_lookup: HashMap<&str, &RepoFile> = repo
298            .files
299            .iter()
300            .map(|file| (file.relative_path.as_str(), file))
301            .collect();
302
303        let mut symbol_index: HashMap<String, Vec<SymbolLocation>> = HashMap::new();
304        let mut seen_symbols: HashSet<(String, usize, String)> = HashSet::new();
305
306        for (chunk_index, chunk) in chunks.iter().enumerate() {
307            for chunk_file in &chunk.files {
308                if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
309                    for symbol in &repo_file.symbols {
310                        if symbol.kind == SymbolKind::Import {
311                            continue;
312                        }
313                        let key = (symbol.name.clone(), chunk_index, chunk_file.path.clone());
314                        if seen_symbols.insert(key) {
315                            symbol_index.entry(symbol.name.clone()).or_default().push(
316                                SymbolLocation { chunk_index, file: chunk_file.path.clone() },
317                            );
318                        }
319                    }
320                }
321            }
322        }
323
324        for (chunk_index, chunk) in chunks.iter_mut().enumerate() {
325            let mut refs: Vec<CrossReference> = Vec::new();
326            let mut seen_refs: HashSet<(String, usize, String)> = HashSet::new();
327
328            'files: for chunk_file in &chunk.files {
329                if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
330                    for symbol in &repo_file.symbols {
331                        for called in &symbol.calls {
332                            if let Some(targets) = symbol_index.get(called) {
333                                for target in targets {
334                                    if target.chunk_index == chunk_index {
335                                        continue;
336                                    }
337                                    let key = (
338                                        called.to_owned(),
339                                        target.chunk_index,
340                                        target.file.clone(),
341                                    );
342                                    if seen_refs.insert(key) {
343                                        refs.push(CrossReference {
344                                            symbol: called.to_owned(),
345                                            chunk_index: target.chunk_index,
346                                            file: target.file.clone(),
347                                        });
348                                        if refs.len() >= MAX_REFS {
349                                            break 'files;
350                                        }
351                                    }
352                                }
353                            }
354                        }
355
356                        if let Some(ref base) = symbol.extends {
357                            if let Some(targets) = symbol_index.get(base) {
358                                for target in targets {
359                                    if target.chunk_index == chunk_index {
360                                        continue;
361                                    }
362                                    let key =
363                                        (base.to_owned(), target.chunk_index, target.file.clone());
364                                    if seen_refs.insert(key) {
365                                        refs.push(CrossReference {
366                                            symbol: base.to_owned(),
367                                            chunk_index: target.chunk_index,
368                                            file: target.file.clone(),
369                                        });
370                                        if refs.len() >= MAX_REFS {
371                                            break 'files;
372                                        }
373                                    }
374                                }
375                            }
376                        }
377
378                        for iface in &symbol.implements {
379                            if let Some(targets) = symbol_index.get(iface) {
380                                for target in targets {
381                                    if target.chunk_index == chunk_index {
382                                        continue;
383                                    }
384                                    let key =
385                                        (iface.to_owned(), target.chunk_index, target.file.clone());
386                                    if seen_refs.insert(key) {
387                                        refs.push(CrossReference {
388                                            symbol: iface.to_owned(),
389                                            chunk_index: target.chunk_index,
390                                            file: target.file.clone(),
391                                        });
392                                        if refs.len() >= MAX_REFS {
393                                            break 'files;
394                                        }
395                                    }
396                                }
397                            }
398                        }
399                    }
400                }
401            }
402
403            refs.sort_by(|a, b| {
404                a.chunk_index
405                    .cmp(&b.chunk_index)
406                    .then_with(|| a.symbol.cmp(&b.symbol))
407                    .then_with(|| a.file.cmp(&b.file))
408            });
409            if refs.len() > MAX_REFS {
410                refs.truncate(MAX_REFS);
411            }
412
413            chunk.context.cross_references = refs;
414        }
415    }
416
    /// Extract content from the end of a chunk for overlap with the next one.
    ///
    /// Walks the chunk's files back-to-front, taking whole files while they
    /// fit in the `overlap_tokens` budget and a trailing partial file (its
    /// last N lines) once they do not. Returns `None` when overlap is
    /// disabled, the chunk is empty, or nothing fits.
    fn extract_overlap_content(&self, chunk: &Chunk) -> Option<String> {
        if self.overlap_tokens == 0 || chunk.files.is_empty() {
            return None;
        }

        let tokenizer = Tokenizer::new();
        let mut overlap_parts = Vec::new();
        let mut remaining_tokens = self.overlap_tokens;
        let token_model = self.model;

        // Take content from the last files until we've accumulated enough tokens
        for file in chunk.files.iter().rev() {
            if remaining_tokens == 0 {
                break;
            }

            let file_tokens = tokenizer.count(&file.content, token_model);
            if file_tokens <= remaining_tokens {
                // Include entire file
                overlap_parts.push(format!("// From: {}\n{}", file.path, file.content));
                remaining_tokens = remaining_tokens.saturating_sub(file_tokens);
            } else {
                // Include partial file (last N lines that fit). Lines are
                // scanned in reverse so the *end* of the file is kept, then
                // reversed back into source order below.
                let lines: Vec<&str> = file.content.lines().collect();
                let mut partial_lines = Vec::new();
                let mut partial_tokens = 0u32;

                for line in lines.iter().rev() {
                    let line_tokens = tokenizer.count(line, token_model);
                    if partial_tokens + line_tokens > remaining_tokens {
                        break;
                    }
                    partial_lines.push(*line);
                    partial_tokens += line_tokens;
                }

                if !partial_lines.is_empty() {
                    partial_lines.reverse();
                    let partial_content = partial_lines.join("\n");
                    overlap_parts
                        .push(format!("// From: {} (partial)\n{}", file.path, partial_content));
                }
                // A partial file means the budget is exhausted either way
                remaining_tokens = 0;
            }
        }

        if overlap_parts.is_empty() {
            None
        } else {
            // Parts were gathered back-to-front; restore original file order
            overlap_parts.reverse();
            Some(overlap_parts.join("\n\n"))
        }
    }
471}
472
473#[cfg(test)]
474#[allow(clippy::str_to_string)]
475mod tests {
476    use super::*;
477    use crate::types::{Symbol, SymbolKind, TokenCounts, Visibility};
478
479    fn create_test_repo() -> Repository {
480        let mut repo = Repository::new("test", "/tmp/test");
481
482        for i in 0..5 {
483            repo.files.push(RepoFile {
484                path: format!("/tmp/test/src/file{}.py", i).into(),
485                relative_path: format!("src/file{}.py", i),
486                language: Some("python".to_string()),
487                size_bytes: 1000,
488                token_count: TokenCounts {
489                    o200k: 480,
490                    cl100k: 490,
491                    claude: 500,
492                    gemini: 470,
493                    llama: 460,
494                    mistral: 460,
495                    deepseek: 460,
496                    qwen: 460,
497                    cohere: 465,
498                    grok: 460,
499                },
500                symbols: Vec::new(),
501                importance: 0.5,
502                content: Some(format!("# File {}\ndef func{}(): pass", i, i)),
503            });
504        }
505
506        repo
507    }
508
509    fn create_multi_module_repo() -> Repository {
510        let mut repo = Repository::new("test", "/tmp/test");
511
512        // Module A: 3 files
513        for i in 0..3 {
514            repo.files.push(RepoFile {
515                path: format!("/tmp/test/moduleA/file{}.py", i).into(),
516                relative_path: format!("moduleA/file{}.py", i),
517                language: Some("python".to_string()),
518                size_bytes: 500,
519                token_count: TokenCounts::default_with_value(300),
520                symbols: Vec::new(),
521                importance: 0.5,
522                content: Some(format!("# Module A File {}\ndef funcA{}(): pass", i, i)),
523            });
524        }
525
526        // Module B: 2 files
527        for i in 0..2 {
528            repo.files.push(RepoFile {
529                path: format!("/tmp/test/moduleB/file{}.py", i).into(),
530                relative_path: format!("moduleB/file{}.py", i),
531                language: Some("python".to_string()),
532                size_bytes: 500,
533                token_count: TokenCounts::default_with_value(300),
534                symbols: Vec::new(),
535                importance: 0.5,
536                content: Some(format!("# Module B File {}\ndef funcB{}(): pass", i, i)),
537            });
538        }
539
540        repo
541    }
542
543    fn create_repo_with_imports() -> Repository {
544        let mut repo = Repository::new("test", "/tmp/test");
545
546        // File A imports nothing
547        let mut file_a = RepoFile {
548            path: "/tmp/test/src/utils.py".into(),
549            relative_path: "src/utils.py".to_string(),
550            language: Some("python".to_string()),
551            size_bytes: 500,
552            token_count: TokenCounts::default_with_value(200),
553            symbols: vec![Symbol::new("helper", SymbolKind::Function)],
554            importance: 0.5,
555            content: Some("def helper(): pass".to_string()),
556        };
557        file_a.symbols[0].start_line = 1;
558        file_a.symbols[0].end_line = 1;
559
560        // File B imports from A
561        let mut file_b = RepoFile {
562            path: "/tmp/test/src/main.py".into(),
563            relative_path: "src/main.py".to_string(),
564            language: Some("python".to_string()),
565            size_bytes: 500,
566            token_count: TokenCounts::default_with_value(200),
567            symbols: vec![
568                Symbol::new("src/utils", SymbolKind::Import),
569                Symbol::new("main", SymbolKind::Function),
570            ],
571            importance: 0.8,
572            content: Some("from utils import helper\ndef main(): helper()".to_string()),
573        };
574        file_b.symbols[1].start_line = 2;
575        file_b.symbols[1].end_line = 2;
576        file_b.symbols[1].calls = vec!["helper".to_string()];
577
578        repo.files.push(file_a);
579        repo.files.push(file_b);
580
581        repo
582    }
583
584    // ============================================
585    // Basic Chunking Strategy Tests
586    // ============================================
587
588    #[test]
589    fn test_fixed_chunking() {
590        let repo = create_test_repo();
591        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000);
592        let chunks = chunker.chunk(&repo);
593
594        assert!(!chunks.is_empty());
595        assert!(chunks
596            .iter()
597            .all(|c| c.tokens <= 1000 || c.files.len() == 1));
598    }
599
600    #[test]
601    fn test_file_chunking() {
602        let repo = create_test_repo();
603        let chunker = Chunker::new(ChunkStrategy::File, 8000);
604        let chunks = chunker.chunk(&repo);
605
606        assert_eq!(chunks.len(), repo.files.len());
607    }
608
609    #[test]
610    fn test_semantic_chunking() {
611        let repo = create_test_repo();
612        let chunker = Chunker::new(ChunkStrategy::Semantic, 2000);
613        let chunks = chunker.chunk(&repo);
614
615        assert!(!chunks.is_empty());
616        // All chunks should have correct total
617        assert!(chunks.iter().all(|c| c.total == chunks.len()));
618    }
619
620    #[test]
621    fn test_symbol_chunking() {
622        let mut repo = create_test_repo();
623        if let Some(file) = repo.files.get_mut(0) {
624            let mut symbol = Symbol::new("func0", SymbolKind::Function);
625            symbol.start_line = 1;
626            symbol.end_line = 1;
627            symbol.visibility = Visibility::Public;
628            file.symbols.push(symbol);
629        }
630
631        let chunker = Chunker::new(ChunkStrategy::Symbol, 500);
632        let chunks = chunker.chunk(&repo);
633
634        assert!(!chunks.is_empty());
635        assert!(chunks.iter().all(|c| c.total == chunks.len()));
636    }
637
638    // ============================================
639    // Module Chunking Tests
640    // ============================================
641
642    #[test]
643    fn test_module_chunking() {
644        let repo = create_multi_module_repo();
645        let chunker = Chunker::new(ChunkStrategy::Module, 2000);
646        let chunks = chunker.chunk(&repo);
647
648        assert!(!chunks.is_empty());
649        // Should group by module
650        assert!(chunks.iter().all(|c| c.total == chunks.len()));
651    }
652
653    #[test]
654    fn test_module_chunking_respects_max_tokens() {
655        let repo = create_multi_module_repo();
656        // Very small max_tokens to force splitting within modules
657        let chunker = Chunker::new(ChunkStrategy::Module, 400);
658        let chunks = chunker.chunk(&repo);
659
660        assert!(!chunks.is_empty());
661        // Each chunk should respect the token limit (or have single file)
662        for chunk in &chunks {
663            assert!(chunk.tokens <= 400 || chunk.files.len() == 1);
664        }
665    }
666
667    #[test]
668    fn test_module_chunking_large_limit() {
669        let repo = create_multi_module_repo();
670        // Large limit - each module should fit in one chunk
671        let chunker = Chunker::new(ChunkStrategy::Module, 10000);
672        let chunks = chunker.chunk(&repo);
673
674        // Should have 2 chunks (one per module)
675        assert_eq!(chunks.len(), 2);
676    }
677
678    // ============================================
679    // Dependency Chunking Tests
680    // ============================================
681
682    #[test]
683    fn test_dependency_chunking() {
684        let repo = create_repo_with_imports();
685        let chunker = Chunker::new(ChunkStrategy::Dependency, 2000);
686        let chunks = chunker.chunk(&repo);
687
688        assert!(!chunks.is_empty());
689        assert!(chunks.iter().all(|c| c.total == chunks.len()));
690    }
691
692    #[test]
693    fn test_dependency_chunking_order() {
694        let repo = create_repo_with_imports();
695        let chunker = Chunker::new(ChunkStrategy::Dependency, 1000);
696        let chunks = chunker.chunk(&repo);
697
698        // Dependencies should appear before dependents
699        assert!(!chunks.is_empty());
700    }
701
702    #[test]
703    fn test_dependency_chunking_with_cycles() {
704        let mut repo = Repository::new("test", "/tmp/test");
705
706        // Create circular dependency
707        let mut file_a = RepoFile {
708            path: "/tmp/test/a.py".into(),
709            relative_path: "a.py".to_string(),
710            language: Some("python".to_string()),
711            size_bytes: 500,
712            token_count: TokenCounts::default_with_value(200),
713            symbols: vec![
714                Symbol::new("b", SymbolKind::Import),
715                Symbol::new("funcA", SymbolKind::Function),
716            ],
717            importance: 0.5,
718            content: Some("from b import funcB\ndef funcA(): funcB()".to_string()),
719        };
720        file_a.symbols[1].calls = vec!["funcB".to_string()];
721
722        let mut file_b = RepoFile {
723            path: "/tmp/test/b.py".into(),
724            relative_path: "b.py".to_string(),
725            language: Some("python".to_string()),
726            size_bytes: 500,
727            token_count: TokenCounts::default_with_value(200),
728            symbols: vec![
729                Symbol::new("a", SymbolKind::Import),
730                Symbol::new("funcB", SymbolKind::Function),
731            ],
732            importance: 0.5,
733            content: Some("from a import funcA\ndef funcB(): funcA()".to_string()),
734        };
735        file_b.symbols[1].calls = vec!["funcA".to_string()];
736
737        repo.files.push(file_a);
738        repo.files.push(file_b);
739
740        let chunker = Chunker::new(ChunkStrategy::Dependency, 1000);
741        let chunks = chunker.chunk(&repo);
742
743        // Should handle cycles gracefully
744        assert!(!chunks.is_empty());
745        // All files should be included
746        let total_files: usize = chunks.iter().map(|c| c.files.len()).sum();
747        assert_eq!(total_files, 2);
748    }
749
750    // ============================================
751    // Symbol Chunking Edge Cases
752    // ============================================
753
754    #[test]
755    fn test_symbol_chunking_no_symbols() {
756        let repo = create_test_repo(); // No symbols
757        let chunker = Chunker::new(ChunkStrategy::Symbol, 500);
758        let chunks = chunker.chunk(&repo);
759
760        // Should fall back to semantic chunking
761        assert!(!chunks.is_empty());
762    }
763
764    #[test]
765    fn test_symbol_chunking_with_imports() {
766        let mut repo = create_test_repo();
767        // Add imports (should be skipped)
768        if let Some(file) = repo.files.get_mut(0) {
769            file.symbols.push(Symbol::new("os", SymbolKind::Import));
770            file.symbols.push(Symbol::new("sys", SymbolKind::Import));
771            let mut func = Symbol::new("func0", SymbolKind::Function);
772            func.start_line = 3;
773            func.end_line = 5;
774            file.symbols.push(func);
775        }
776
777        let chunker = Chunker::new(ChunkStrategy::Symbol, 1000);
778        let chunks = chunker.chunk(&repo);
779
780        assert!(!chunks.is_empty());
781    }
782
783    #[test]
784    fn test_symbol_chunking_multiple_symbols_per_file() {
785        let mut repo = Repository::new("test", "/tmp/test");
786        let mut file = RepoFile {
787            path: "/tmp/test/main.py".into(),
788            relative_path: "main.py".to_string(),
789            language: Some("python".to_string()),
790            size_bytes: 1000,
791            token_count: TokenCounts::default_with_value(500),
792            symbols: Vec::new(),
793            importance: 0.8,
794            content: Some("def func1(): pass\ndef func2(): pass\ndef func3(): pass".to_string()),
795        };
796
797        for i in 1..=3 {
798            let mut sym = Symbol::new(&format!("func{}", i), SymbolKind::Function);
799            sym.start_line = i;
800            sym.end_line = i;
801            sym.importance = 0.9 - (i as f32 * 0.1);
802            file.symbols.push(sym);
803        }
804        repo.files.push(file);
805
806        let chunker = Chunker::new(ChunkStrategy::Symbol, 2000);
807        let chunks = chunker.chunk(&repo);
808
809        assert!(!chunks.is_empty());
810    }
811
812    // ============================================
813    // Chunker Builder Tests
814    // ============================================
815
816    #[test]
817    fn test_chunker_with_overlap() {
818        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000).with_overlap(500);
819        assert_eq!(chunker.overlap_tokens, 500);
820    }
821
822    #[test]
823    fn test_chunker_with_model() {
824        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000)
825            .with_model(TokenizerModel::Gpt4o);
826        assert_eq!(chunker.model, TokenizerModel::Gpt4o);
827    }
828
829    #[test]
830    fn test_chunker_builder_chain() {
831        let chunker = Chunker::new(ChunkStrategy::Semantic, 2000)
832            .with_overlap(300)
833            .with_model(TokenizerModel::Gemini);
834
835        assert_eq!(chunker.overlap_tokens, 300);
836        assert_eq!(chunker.model, TokenizerModel::Gemini);
837        assert!(matches!(chunker.strategy, ChunkStrategy::Semantic));
838    }
839
840    // ============================================
841    // Focus Determination Tests
842    // ============================================
843
844    #[test]
845    fn test_determine_focus_empty() {
846        let chunker = Chunker::new(ChunkStrategy::File, 1000);
847        let files: Vec<RepoFile> = vec![];
848        let focus = chunker.determine_focus(&files);
849        assert_eq!(focus, "Empty");
850    }
851
852    #[test]
853    fn test_determine_focus_common_module() {
854        let repo = create_multi_module_repo();
855        let chunker = Chunker::new(ChunkStrategy::File, 1000);
856        // Get just moduleA files
857        let module_a_files: Vec<RepoFile> = repo
858            .files
859            .iter()
860            .filter(|f| f.relative_path.starts_with("moduleA"))
861            .cloned()
862            .collect();
863
864        let focus = chunker.determine_focus(&module_a_files);
865        assert!(focus.contains("moduleA"));
866    }
867
868    #[test]
869    fn test_determine_focus_common_language() {
870        let mut repo = Repository::new("test", "/tmp/test");
871        for i in 0..3 {
872            repo.files.push(RepoFile {
873                path: format!("/tmp/test/dir{}/file.rs", i).into(),
874                relative_path: format!("dir{}/file.rs", i),
875                language: Some("rust".to_string()),
876                size_bytes: 500,
877                token_count: TokenCounts::default_with_value(200),
878                symbols: Vec::new(),
879                importance: 0.5,
880                content: Some("fn main() {}".to_string()),
881            });
882        }
883
884        let chunker = Chunker::new(ChunkStrategy::File, 1000);
885        let focus = chunker.determine_focus(&repo.files);
886        assert!(focus.contains("rust") || focus.contains("Mixed"));
887    }
888
889    // ============================================
890    // Chunk Context Tests
891    // ============================================
892
893    #[test]
894    fn test_chunk_context_previous_summary() {
895        let repo = create_test_repo();
896        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
897        let chunks = chunker.chunk(&repo);
898
899        if chunks.len() > 1 {
900            // First chunk has no previous
901            assert!(chunks[0].context.previous_summary.is_none());
902            // Second chunk should have previous
903            assert!(chunks[1].context.previous_summary.is_some());
904        }
905    }
906
907    #[test]
908    fn test_chunk_context_next_preview() {
909        let repo = create_test_repo();
910        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
911        let chunks = chunker.chunk(&repo);
912
913        if chunks.len() > 1 {
914            // First chunk should have next preview
915            assert!(chunks[0].context.next_preview.is_some());
916            // Last chunk has no next
917            assert!(chunks.last().unwrap().context.next_preview.is_none());
918        }
919    }
920
921    // ============================================
922    // Overlap Content Tests
923    // ============================================
924
925    #[test]
926    fn test_extract_overlap_content() {
927        let repo = create_test_repo();
928        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600).with_overlap(100);
929        let chunks = chunker.chunk(&repo);
930
931        // If there are multiple chunks, later ones should have overlap
932        if chunks.len() > 1 {
933            // Overlap is added during finalization
934            // Just verify chunking completes without error
935            assert!(chunks.iter().all(|c| c.total == chunks.len()));
936        }
937    }
938
939    #[test]
940    fn test_no_overlap_when_zero() {
941        let repo = create_test_repo();
942        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600).with_overlap(0);
943        let chunks = chunker.chunk(&repo);
944
945        // No overlap content should be added
946        for chunk in &chunks {
947            assert!(chunk.context.overlap_content.is_none());
948        }
949    }
950
951    // ============================================
952    // Cross Reference Tests
953    // ============================================
954
955    #[test]
956    fn test_cross_references_populated() {
957        let repo = create_repo_with_imports();
958        let chunker = Chunker::new(ChunkStrategy::File, 1000);
959        let chunks = chunker.chunk(&repo);
960
961        // Cross references should be populated during finalization
962        assert!(!chunks.is_empty());
963    }
964
965    // ============================================
966    // Empty Repository Tests
967    // ============================================
968
969    #[test]
970    fn test_fixed_chunking_empty_repo() {
971        let repo = Repository::new("empty", "/tmp/empty");
972        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000);
973        let chunks = chunker.chunk(&repo);
974        assert!(chunks.is_empty());
975    }
976
977    #[test]
978    fn test_module_chunking_empty_repo() {
979        let repo = Repository::new("empty", "/tmp/empty");
980        let chunker = Chunker::new(ChunkStrategy::Module, 1000);
981        let chunks = chunker.chunk(&repo);
982        assert!(chunks.is_empty());
983    }
984
985    #[test]
986    fn test_dependency_chunking_empty_repo() {
987        let repo = Repository::new("empty", "/tmp/empty");
988        let chunker = Chunker::new(ChunkStrategy::Dependency, 1000);
989        let chunks = chunker.chunk(&repo);
990        assert!(chunks.is_empty());
991    }
992
993    // ============================================
994    // Large File Tests
995    // ============================================
996
997    #[test]
998    fn test_fixed_chunking_single_large_file() {
999        let mut repo = Repository::new("test", "/tmp/test");
1000        repo.files.push(RepoFile {
1001            path: "/tmp/test/large.py".into(),
1002            relative_path: "large.py".to_string(),
1003            language: Some("python".to_string()),
1004            size_bytes: 50000,
1005            token_count: TokenCounts::default_with_value(10000),
1006            symbols: Vec::new(),
1007            importance: 0.5,
1008            content: Some("x = 1\n".repeat(1000)),
1009        });
1010
1011        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 500 }, 500);
1012        let chunks = chunker.chunk(&repo);
1013
1014        // Large file should be in its own chunk
1015        assert!(!chunks.is_empty());
1016    }
1017
1018    // ============================================
1019    // Chunk Total Count Tests
1020    // ============================================
1021
1022    #[test]
1023    fn test_chunk_total_is_correct() {
1024        let repo = create_test_repo();
1025        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
1026        let chunks = chunker.chunk(&repo);
1027
1028        let expected_total = chunks.len();
1029        for chunk in &chunks {
1030            assert_eq!(chunk.total, expected_total);
1031        }
1032    }
1033
1034    #[test]
1035    fn test_chunk_index_is_sequential() {
1036        let repo = create_test_repo();
1037        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
1038        let chunks = chunker.chunk(&repo);
1039
1040        for (i, chunk) in chunks.iter().enumerate() {
1041            assert_eq!(chunk.index, i);
1042        }
1043    }
1044}