infiniloom_engine/chunking/mod.rs

//! Intelligent code chunking for LLM context windows

use crate::types::{RepoFile, Repository, SymbolKind, TokenizerModel};
use serde::Serialize;

/// A chunk of repository content
#[derive(Debug, Clone, Serialize)]
pub struct Chunk {
    /// Chunk index (0-based)
    pub index: usize,
    /// Total number of chunks
    pub total: usize,
    /// Focus/theme of this chunk
    pub focus: String,
    /// Token count for this chunk
    pub tokens: u32,
    /// Files included in this chunk
    pub files: Vec<ChunkFile>,
    /// Context information
    pub context: ChunkContext,
}

/// A file within a chunk
#[derive(Debug, Clone, Serialize)]
pub struct ChunkFile {
    /// Relative file path
    pub path: String,
    /// File content (may be compressed)
    pub content: String,
    /// Token count
    pub tokens: u32,
    /// Whether content is truncated
    pub truncated: bool,
}

/// Context for chunk continuity
#[derive(Debug, Clone, Serialize)]
pub struct ChunkContext {
    /// Summary of previous chunks
    pub previous_summary: Option<String>,
    /// Current focus description
    pub current_focus: String,
    /// Preview of next chunk
    pub next_preview: Option<String>,
    /// Cross-references to other chunks
    pub cross_references: Vec<CrossReference>,
    /// Overlap content from previous chunk (for context continuity)
    pub overlap_content: Option<String>,
}

/// Reference to a symbol in another chunk
#[derive(Debug, Clone, Serialize)]
pub struct CrossReference {
    /// Symbol name
    pub symbol: String,
    /// Chunk containing the symbol
    pub chunk_index: usize,
    /// File containing the symbol
    pub file: String,
}

#[derive(Debug, Clone)]
struct SymbolSnippet {
    file_path: String,
    symbol_name: String,
    start_line: u32,
    content: String,
    tokens: u32,
    importance: f32,
}

/// Chunking strategy
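///
/// # Example
///
/// A minimal sketch of selecting a strategy (illustrative values only;
/// `Semantic` is the default variant):
///
/// ```ignore
/// let fixed = ChunkStrategy::Fixed { size: 4000 }; // hard token budget per chunk
/// let default_strategy = ChunkStrategy::default(); // ChunkStrategy::Semantic
/// ```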
#[derive(Debug, Clone, Copy, Default)]
pub enum ChunkStrategy {
    /// Fixed token size chunks
    Fixed {
        /// Maximum tokens per chunk
        size: u32,
    },
    /// One file per chunk
    File,
    /// Group by module/directory
    Module,
    /// Group by symbols (AST-based)
    Symbol,
    /// Group by semantic similarity
    #[default]
    Semantic,
    /// Group by dependency order
    Dependency,
}

/// Chunker for splitting repositories
pub struct Chunker {
    /// Chunking strategy
    strategy: ChunkStrategy,
    /// Maximum tokens per chunk
    max_tokens: u32,
    /// Overlap tokens between chunks
    overlap_tokens: u32,
    /// Target model for token counting
    model: TokenizerModel,
}

impl Chunker {
    /// Create a new chunker
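    ///
    /// Defaults to 200 overlap tokens and `TokenizerModel::Claude`; both can
    /// be overridden with the builder methods below. A minimal usage sketch
    /// (`repo` is assumed to be a populated [`Repository`]):
    ///
    /// ```ignore
    /// let chunker = Chunker::new(ChunkStrategy::Semantic, 8000)
    ///     .with_overlap(200)
    ///     .with_model(TokenizerModel::Claude);
    /// let chunks = chunker.chunk(&repo);
    /// assert!(chunks.iter().all(|c| c.total == chunks.len()));
    /// ```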
    pub fn new(strategy: ChunkStrategy, max_tokens: u32) -> Self {
        Self { strategy, max_tokens, overlap_tokens: 200, model: TokenizerModel::Claude }
    }

    /// Set overlap tokens
    pub fn with_overlap(mut self, tokens: u32) -> Self {
        self.overlap_tokens = tokens;
        self
    }

    /// Set target model
    pub fn with_model(mut self, model: TokenizerModel) -> Self {
        self.model = model;
        self
    }

    /// Chunk a repository
    pub fn chunk(&self, repo: &Repository) -> Vec<Chunk> {
        match self.strategy {
            ChunkStrategy::Fixed { size } => self.fixed_chunk(repo, size),
            ChunkStrategy::File => self.file_chunk(repo),
            ChunkStrategy::Module => self.module_chunk(repo),
            ChunkStrategy::Symbol => self.symbol_chunk(repo),
            ChunkStrategy::Semantic => self.semantic_chunk(repo),
            ChunkStrategy::Dependency => self.dependency_chunk(repo),
        }
    }

    /// Fixed-size chunking
    fn fixed_chunk(&self, repo: &Repository, size: u32) -> Vec<Chunk> {
        let mut chunks = Vec::new();
        let mut current_files = Vec::new();
        let mut current_tokens = 0u32;

        for file in &repo.files {
            let file_tokens = file.token_count.get(self.model);

            if current_tokens + file_tokens > size && !current_files.is_empty() {
                chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));
                current_files.clear();
                current_tokens = 0;
            }

            current_files.push(file.clone());
            current_tokens += file_tokens;
        }

        if !current_files.is_empty() {
            chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));
        }

        self.finalize_chunks(chunks, repo)
    }

    /// One file per chunk
    fn file_chunk(&self, repo: &Repository) -> Vec<Chunk> {
        let chunks: Vec<_> = repo
            .files
            .iter()
            .enumerate()
            .map(|(i, file)| {
                self.create_chunk(i, std::slice::from_ref(file), file.token_count.get(self.model))
            })
            .collect();

        self.finalize_chunks(chunks, repo)
    }

    /// Group by module/directory, respecting max_tokens limit
    fn module_chunk(&self, repo: &Repository) -> Vec<Chunk> {
        use std::collections::HashMap;

        let mut modules: HashMap<String, Vec<RepoFile>> = HashMap::new();

        for file in &repo.files {
            let module = file
                .relative_path
                .split('/')
                .next()
                .unwrap_or("root")
                .to_owned();

            modules.entry(module).or_default().push(file.clone());
        }

        // Sort modules for consistent ordering
        let mut sorted_modules: Vec<_> = modules.into_iter().collect();
        sorted_modules.sort_by(|a, b| a.0.cmp(&b.0));

        let mut chunks = Vec::new();

        for (_module_name, mut files) in sorted_modules {
            // Sort files within module by path
            files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));

            let module_tokens: u32 = files.iter().map(|f| f.token_count.get(self.model)).sum();

            if module_tokens <= self.max_tokens {
                // Module fits in one chunk
                chunks.push(self.create_chunk(chunks.len(), &files, module_tokens));
            } else {
                // Module exceeds max_tokens - split it into multiple chunks
                let mut current_files = Vec::new();
                let mut current_tokens = 0u32;

                for file in files {
                    let file_tokens = file.token_count.get(self.model);

                    // If adding this file would exceed limit and we have files, create chunk
                    if current_tokens + file_tokens > self.max_tokens && !current_files.is_empty() {
                        chunks.push(self.create_chunk(
                            chunks.len(),
                            &current_files,
                            current_tokens,
                        ));
                        current_files = Vec::new();
                        current_tokens = 0;
                    }

                    // Add file to current chunk (even if it alone exceeds max_tokens)
                    current_files.push(file);
                    current_tokens += file_tokens;
                }

                // Don't forget remaining files
                if !current_files.is_empty() {
                    chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));
                }
            }
        }

        self.finalize_chunks(chunks, repo)
    }

    /// Symbol-based chunking - groups by key symbols with small context
    fn symbol_chunk(&self, repo: &Repository) -> Vec<Chunk> {
        use crate::tokenizer::Tokenizer;

        const CONTEXT_LINES: u32 = 2;
        let tokenizer = Tokenizer::new();
        let mut snippets: Vec<SymbolSnippet> = Vec::new();

        for file in &repo.files {
            let content = match &file.content {
                Some(content) => content,
                None => continue,
            };

            let lines: Vec<&str> = content.lines().collect();
            let total_lines = lines.len() as u32;
            if total_lines == 0 {
                continue;
            }

            for symbol in &file.symbols {
                if symbol.kind == SymbolKind::Import {
                    continue;
                }

                let snippet_content = if symbol.start_line > 0
                    && symbol.end_line >= symbol.start_line
                    && symbol.start_line <= total_lines
                {
                    let start = symbol.start_line.saturating_sub(CONTEXT_LINES).max(1);
                    let end = symbol
                        .end_line
                        .max(symbol.start_line)
                        .saturating_add(CONTEXT_LINES)
                        .min(total_lines);
                    let start_idx = start.saturating_sub(1) as usize;
                    let end_idx = end.saturating_sub(1) as usize;
                    if start_idx > end_idx || end_idx >= lines.len() {
                        continue;
                    }

                    let mut snippet = String::new();
                    snippet.push_str(&format!(
                        "// {}: {} (lines {}-{})\n",
                        symbol.kind.name(),
                        symbol.name,
                        start,
                        end
                    ));
                    snippet.push_str(&lines[start_idx..=end_idx].join("\n"));
                    snippet
                } else if let Some(ref sig) = symbol.signature {
                    format!("// {}: {}\n{}", symbol.kind.name(), symbol.name, sig.trim())
                } else {
                    continue;
                };

                let tokens = tokenizer.count(&snippet_content, self.model);
                let importance = (symbol.importance * 0.7) + (file.importance * 0.3);

                snippets.push(SymbolSnippet {
                    file_path: file.relative_path.clone(),
                    symbol_name: symbol.name.clone(),
                    start_line: symbol.start_line,
                    content: snippet_content,
                    tokens,
                    importance,
                });
            }
        }

        if snippets.is_empty() {
            return self.semantic_chunk(repo);
        }

        snippets.sort_by(|a, b| {
            b.importance
                .partial_cmp(&a.importance)
                .unwrap_or(std::cmp::Ordering::Equal)
                .then_with(|| a.tokens.cmp(&b.tokens))
                .then_with(|| a.file_path.cmp(&b.file_path))
        });

        let mut chunks: Vec<Chunk> = Vec::new();
        let mut current: Vec<SymbolSnippet> = Vec::new();
        let mut current_tokens = 0u32;

        for snippet in snippets {
            if current_tokens + snippet.tokens > self.max_tokens && !current.is_empty() {
                chunks.push(self.build_symbol_chunk(chunks.len(), &current, &tokenizer));
                current.clear();
                current_tokens = 0;
            }

            current_tokens += snippet.tokens;
            current.push(snippet);
        }

        if !current.is_empty() {
            chunks.push(self.build_symbol_chunk(chunks.len(), &current, &tokenizer));
        }

        self.finalize_chunks(chunks, repo)
    }

    /// Semantic chunking (group related files)
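    ///
    /// Files are visited in path order. A new chunk starts when adding the
    /// next file would exceed `max_tokens`, or when the top-level directory
    /// changes while the current chunk is already more than half full. A small
    /// tail of the previous chunk is carried forward as overlap for context.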
    fn semantic_chunk(&self, repo: &Repository) -> Vec<Chunk> {
        let mut chunks = Vec::new();
        let mut current_files = Vec::new();
        let mut current_tokens = 0u32;
        let mut current_module: Option<String> = None;

        // Sort files by path for better grouping
        let mut sorted_files: Vec<_> = repo.files.iter().collect();
        sorted_files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));

        for file in sorted_files {
            let file_tokens = file.token_count.get(self.model);
            let file_module = file.relative_path.split('/').next().map(String::from);

            // Check if we should start a new chunk
            let should_split = current_tokens + file_tokens > self.max_tokens
                || (current_module.is_some()
                    && file_module.is_some()
                    && current_module != file_module
                    && current_tokens > self.max_tokens / 2);

            if should_split && !current_files.is_empty() {
                chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));

                // Keep some overlap for context
                current_files = self.get_overlap_files(&current_files);
                current_tokens = current_files
                    .iter()
                    .map(|f| f.token_count.get(self.model))
                    .sum();
            }

            current_files.push(file.clone());
            current_tokens += file_tokens;
            current_module = file_module;
        }

        if !current_files.is_empty() {
            chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));
        }

        self.finalize_chunks(chunks, repo)
    }

    /// Dependency-based chunking - groups files by their import dependencies
    /// Files are ordered so that dependencies appear before dependents
    fn dependency_chunk(&self, repo: &Repository) -> Vec<Chunk> {
        use std::collections::{HashMap, HashSet, VecDeque};

        // Build a map of file path to index
        let file_indices: HashMap<&str, usize> = repo
            .files
            .iter()
            .enumerate()
            .map(|(i, f)| (f.relative_path.as_str(), i))
            .collect();

        // Build the dependency graph in both directions:
        //   imports_from[idx] = indices of files that file idx imports
        //   imported_by[idx]  = indices of files that import file idx
        let mut imports_from: Vec<HashSet<usize>> = vec![HashSet::new(); repo.files.len()];
        let mut imported_by: Vec<HashSet<usize>> = vec![HashSet::new(); repo.files.len()];

        for (idx, file) in repo.files.iter().enumerate() {
            // Look at symbols to find imports
            for symbol in &file.symbols {
                if symbol.kind == SymbolKind::Import {
                    // Try to resolve the import to a file in the repo
                    let import_name = &symbol.name;

                    // Check various path patterns
                    let potential_paths = Self::resolve_import_paths(import_name, file);

                    for potential in potential_paths {
                        if let Some(&target_idx) = file_indices.get(potential.as_str()) {
                            if target_idx != idx {
                                imports_from[idx].insert(target_idx);
                                imported_by[target_idx].insert(idx);
                            }
                        }
                    }
                }
            }
        }

        // Topological sort using Kahn's algorithm
        let mut in_degree: Vec<usize> = imports_from.iter().map(|deps| deps.len()).collect();
        let mut queue: VecDeque<usize> = in_degree
            .iter()
            .enumerate()
            .filter_map(|(i, &d)| if d == 0 { Some(i) } else { None })
            .collect();

        let mut sorted_indices: Vec<usize> = Vec::with_capacity(repo.files.len());
        let mut sorted_set: HashSet<usize> = HashSet::with_capacity(repo.files.len());

        while let Some(idx) = queue.pop_front() {
            sorted_indices.push(idx);
            sorted_set.insert(idx);
            for &dependent in &imported_by[idx] {
                in_degree[dependent] -= 1;
                if in_degree[dependent] == 0 {
                    queue.push_back(dependent);
                }
            }
        }

        // Handle any cycles by adding remaining files (files in cycles)
        // Using HashSet for O(1) lookups instead of O(n) Vec::contains
        if sorted_indices.len() < repo.files.len() {
            for idx in 0..repo.files.len() {
                if !sorted_set.contains(&idx) {
                    sorted_indices.push(idx);
                }
            }
        }

        // Now chunk the sorted files, trying to keep related files together
        let mut chunks = Vec::new();
        let mut current_files = Vec::new();
        let mut current_tokens = 0u32;
        let mut current_deps: HashSet<usize> = HashSet::new();

        for &idx in &sorted_indices {
            let file = &repo.files[idx];
            let file_tokens = file.token_count.get(self.model);

            // Check if this file depends on files in the current chunk
            let depends_on_current = imports_from[idx].iter().any(|d| current_deps.contains(d));

            // Should we start a new chunk?
            let should_split = current_tokens + file_tokens > self.max_tokens
                && !current_files.is_empty()
                && !depends_on_current; // Try to keep dependent files together

            if should_split {
                chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));
                current_files.clear();
                current_tokens = 0;
                current_deps.clear();
            }

            current_files.push(file.clone());
            current_tokens += file_tokens;
            current_deps.insert(idx);
        }

        if !current_files.is_empty() {
            chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));
        }

        self.finalize_chunks(chunks, repo)
    }

    /// Resolve an import name to potential file paths
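    ///
    /// Heuristic candidate expansion only; e.g. an import named `pkg::util`
    /// in a (hypothetical) `src/main.rs` yields candidates such as
    /// `pkg/util.rs`, `pkg/util/mod.rs`, and `src/pkg/util.rs`, which are then
    /// matched against the repository's relative paths.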
    fn resolve_import_paths(import_name: &str, source_file: &RepoFile) -> Vec<String> {
        let mut paths = Vec::new();
        let source_dir = source_file
            .relative_path
            .rsplit_once('/')
            .map(|(d, _)| d)
            .unwrap_or("");

        // Convert import to potential paths (handles various languages)
        let normalized = import_name.replace("::", "/").replace(['.', '\\'], "/");

        // Try with common extensions
        let extensions = ["py", "js", "ts", "tsx", "jsx", "rs", "go", "java", "rb"];
        for ext in extensions {
            // Absolute import
            paths.push(format!("{}.{}", normalized, ext));
            paths.push(format!("{}/index.{}", normalized, ext));
            paths.push(format!("{}/mod.{}", normalized, ext));

            // Relative to source file
            if !source_dir.is_empty() {
                paths.push(format!("{}/{}.{}", source_dir, normalized, ext));
            }
        }

        // Also try the exact path if it looks like a file
        if import_name.contains('/') || import_name.contains('.') {
            paths.push(import_name.to_owned());
        }

        paths
    }
    fn create_chunk(&self, index: usize, files: &[RepoFile], tokens: u32) -> Chunk {
        let focus = self.determine_focus(files);

        Chunk {
            index,
            total: 0, // Updated in finalize
            focus: focus.clone(),
            tokens,
            files: files
                .iter()
                .map(|f| ChunkFile {
                    path: f.relative_path.clone(),
                    content: f.content.clone().unwrap_or_default(),
                    tokens: f.token_count.get(self.model),
                    truncated: false,
                })
                .collect(),
            context: ChunkContext {
                previous_summary: None,
                current_focus: focus,
                next_preview: None,
                cross_references: Vec::new(),
                overlap_content: None,
            },
        }
    }

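    /// Assemble a chunk from symbol snippets: snippets are grouped by file
    /// (files in path order, snippets by start line), joined with blank lines,
    /// and tokens are re-counted on the merged per-file content.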
    fn build_symbol_chunk(
        &self,
        index: usize,
        snippets: &[SymbolSnippet],
        tokenizer: &crate::tokenizer::Tokenizer,
    ) -> Chunk {
        use std::collections::BTreeMap;

        let focus = self.determine_symbol_focus(snippets);
        let mut by_file: BTreeMap<&str, Vec<&SymbolSnippet>> = BTreeMap::new();

        for snippet in snippets {
            by_file
                .entry(snippet.file_path.as_str())
                .or_default()
                .push(snippet);
        }

        let mut files = Vec::new();
        let mut total_tokens = 0u32;

        for (path, mut entries) in by_file {
            entries.sort_by(|a, b| {
                a.start_line
                    .cmp(&b.start_line)
                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
            });

            let mut content = String::new();
            for entry in entries {
                if !content.is_empty() {
                    content.push_str("\n\n");
                }
                content.push_str(&entry.content);
            }

            let tokens = tokenizer.count(&content, self.model);
            total_tokens += tokens;

            files.push(ChunkFile { path: path.to_owned(), content, tokens, truncated: false });
        }

        Chunk {
            index,
            total: 0,
            focus: focus.clone(),
            tokens: total_tokens,
            files,
            context: ChunkContext {
                previous_summary: None,
                current_focus: focus,
                next_preview: None,
                cross_references: Vec::new(),
                overlap_content: None,
            },
        }
    }

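    /// Pick a human-readable focus label: the shared top-level directory if
    /// every file lives under it, else the shared language, else "Mixed content".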
    fn determine_focus(&self, files: &[RepoFile]) -> String {
        if files.is_empty() {
            return "Empty".to_owned();
        }

        // Try to find common directory
        let first_path = &files[0].relative_path;
        if let Some(module) = first_path.split('/').next() {
            if files.iter().all(|f| f.relative_path.starts_with(module)) {
                return format!("{} module", module);
            }
        }

        // Try to find common language
        if let Some(lang) = &files[0].language {
            if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
                return format!("{} files", lang);
            }
        }

        "Mixed content".to_owned()
    }

    fn determine_symbol_focus(&self, snippets: &[SymbolSnippet]) -> String {
        if snippets.is_empty() {
            return "Symbols".to_owned();
        }

        let mut names: Vec<String> = snippets
            .iter()
            .take(3)
            .map(|snippet| snippet.symbol_name.clone())
            .collect();

        let suffix = if snippets.len() > names.len() {
            format!(" +{} more", snippets.len() - names.len())
        } else {
            String::new()
        };

        if names.len() == 1 {
            format!("Symbol: {}{}", names.remove(0), suffix)
        } else {
            format!("Symbols: {}{}", names.join(", "), suffix)
        }
    }

    fn get_overlap_files(&self, files: &[RepoFile]) -> Vec<RepoFile> {
        // Keep files that might be needed for context
        // For now, just keep the last file if it's small enough
        files
            .last()
            .filter(|f| f.token_count.get(self.model) < self.overlap_tokens)
            .cloned()
            .into_iter()
            .collect()
    }

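    /// Second pass over assembled chunks: sets `total`, wires the
    /// previous/next context strings, attaches overlap content extracted from
    /// the preceding chunk, and populates cross-references.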
    fn finalize_chunks(&self, mut chunks: Vec<Chunk>, repo: &Repository) -> Vec<Chunk> {
        let total = chunks.len();

        // First pass: collect the focus strings and overlap content we need
        let focus_strs: Vec<String> = chunks.iter().map(|c| c.focus.clone()).collect();

        // Extract overlap content from each chunk for the next one
        let overlap_contents: Vec<Option<String>> = if self.overlap_tokens > 0 {
            chunks
                .iter()
                .map(|chunk| self.extract_overlap_content(chunk))
                .collect()
        } else {
            vec![None; chunks.len()]
        };

        for (i, chunk) in chunks.iter_mut().enumerate() {
            chunk.total = total;

            // Add previous summary
            if i > 0 {
                chunk.context.previous_summary = Some(format!("Previous: {}", focus_strs[i - 1]));

                // Add overlap content from previous chunk
                if let Some(ref overlap) = overlap_contents[i - 1] {
                    chunk.context.overlap_content = Some(format!(
                        "<!-- [OVERLAP FROM PREVIOUS CHUNK] -->\n{}\n<!-- [END OVERLAP] -->",
                        overlap
                    ));
                }
            }

            // Add next preview
            if i + 1 < total {
                chunk.context.next_preview = Some(format!("Next: Chunk {}", i + 2));
            }
        }

        self.populate_cross_references(&mut chunks, repo);

        chunks
    }

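    /// Build cross-chunk references: index every non-import symbol by chunk,
    /// then link each chunk's `calls`, `extends`, and `implements` edges to
    /// symbols living in other chunks, capped at `MAX_REFS` per chunk.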
    fn populate_cross_references(&self, chunks: &mut [Chunk], repo: &Repository) {
        use std::collections::{HashMap, HashSet};

        const MAX_REFS: usize = 25;

        #[derive(Clone)]
        struct SymbolLocation {
            chunk_index: usize,
            file: String,
        }

        let file_lookup: HashMap<&str, &RepoFile> = repo
            .files
            .iter()
            .map(|file| (file.relative_path.as_str(), file))
            .collect();

        let mut symbol_index: HashMap<String, Vec<SymbolLocation>> = HashMap::new();
        let mut seen_symbols: HashSet<(String, usize, String)> = HashSet::new();

        for (chunk_index, chunk) in chunks.iter().enumerate() {
            for chunk_file in &chunk.files {
                if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
                    for symbol in &repo_file.symbols {
                        if symbol.kind == SymbolKind::Import {
                            continue;
                        }
                        let key = (symbol.name.clone(), chunk_index, chunk_file.path.clone());
                        if seen_symbols.insert(key) {
                            symbol_index.entry(symbol.name.clone()).or_default().push(
                                SymbolLocation { chunk_index, file: chunk_file.path.clone() },
                            );
                        }
                    }
                }
            }
        }

        for (chunk_index, chunk) in chunks.iter_mut().enumerate() {
            let mut refs: Vec<CrossReference> = Vec::new();
            let mut seen_refs: HashSet<(String, usize, String)> = HashSet::new();

            'files: for chunk_file in &chunk.files {
                if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
                    for symbol in &repo_file.symbols {
                        for called in &symbol.calls {
                            if let Some(targets) = symbol_index.get(called) {
                                for target in targets {
                                    if target.chunk_index == chunk_index {
                                        continue;
                                    }
                                    let key = (
                                        called.to_owned(),
                                        target.chunk_index,
                                        target.file.clone(),
                                    );
                                    if seen_refs.insert(key) {
                                        refs.push(CrossReference {
                                            symbol: called.to_owned(),
                                            chunk_index: target.chunk_index,
                                            file: target.file.clone(),
                                        });
                                        if refs.len() >= MAX_REFS {
                                            break 'files;
                                        }
                                    }
                                }
                            }
                        }

                        if let Some(ref base) = symbol.extends {
                            if let Some(targets) = symbol_index.get(base) {
                                for target in targets {
                                    if target.chunk_index == chunk_index {
                                        continue;
                                    }
                                    let key =
                                        (base.to_owned(), target.chunk_index, target.file.clone());
                                    if seen_refs.insert(key) {
                                        refs.push(CrossReference {
                                            symbol: base.to_owned(),
                                            chunk_index: target.chunk_index,
                                            file: target.file.clone(),
                                        });
                                        if refs.len() >= MAX_REFS {
                                            break 'files;
                                        }
                                    }
                                }
                            }
                        }

                        for iface in &symbol.implements {
                            if let Some(targets) = symbol_index.get(iface) {
                                for target in targets {
                                    if target.chunk_index == chunk_index {
                                        continue;
                                    }
                                    let key =
                                        (iface.to_owned(), target.chunk_index, target.file.clone());
                                    if seen_refs.insert(key) {
                                        refs.push(CrossReference {
                                            symbol: iface.to_owned(),
                                            chunk_index: target.chunk_index,
                                            file: target.file.clone(),
                                        });
                                        if refs.len() >= MAX_REFS {
                                            break 'files;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }

            refs.sort_by(|a, b| {
                a.chunk_index
                    .cmp(&b.chunk_index)
                    .then_with(|| a.symbol.cmp(&b.symbol))
                    .then_with(|| a.file.cmp(&b.file))
            });
            if refs.len() > MAX_REFS {
                refs.truncate(MAX_REFS);
            }

            chunk.context.cross_references = refs;
        }
    }

    /// Extract content from the end of a chunk for overlap
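    ///
    /// Walks the chunk's files in reverse, taking whole files while they fit
    /// in the overlap budget and, for the first file that does not fit, the
    /// trailing lines that do.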
    fn extract_overlap_content(&self, chunk: &Chunk) -> Option<String> {
        use crate::tokenizer::Tokenizer;

        if self.overlap_tokens == 0 || chunk.files.is_empty() {
            return None;
        }

        let tokenizer = Tokenizer::new();
        let mut overlap_parts = Vec::new();
        let mut remaining_tokens = self.overlap_tokens;
        let token_model = self.model;

        // Take content from the last files until we've accumulated enough tokens
        for file in chunk.files.iter().rev() {
            if remaining_tokens == 0 {
                break;
            }

            let file_tokens = tokenizer.count(&file.content, token_model);
            if file_tokens <= remaining_tokens {
                // Include entire file
                overlap_parts.push(format!("// From: {}\n{}", file.path, file.content));
                remaining_tokens = remaining_tokens.saturating_sub(file_tokens);
            } else {
                // Include partial file (last N lines that fit)
                let lines: Vec<&str> = file.content.lines().collect();
                let mut partial_lines = Vec::new();
                let mut partial_tokens = 0u32;

                for line in lines.iter().rev() {
                    let line_tokens = tokenizer.count(line, token_model);
                    if partial_tokens + line_tokens > remaining_tokens {
                        break;
                    }
                    partial_lines.push(*line);
                    partial_tokens += line_tokens;
                }

                if !partial_lines.is_empty() {
                    partial_lines.reverse();
                    let partial_content = partial_lines.join("\n");
                    overlap_parts
                        .push(format!("// From: {} (partial)\n{}", file.path, partial_content));
                }
                remaining_tokens = 0;
            }
        }

        if overlap_parts.is_empty() {
            None
        } else {
            overlap_parts.reverse();
            Some(overlap_parts.join("\n\n"))
        }
    }
}

#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use crate::types::{Symbol, SymbolKind, TokenCounts, Visibility};

    fn create_test_repo() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        for i in 0..5 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/src/file{}.py", i).into(),
                relative_path: format!("src/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 1000,
                token_count: TokenCounts {
                    o200k: 480,
                    cl100k: 490,
                    claude: 500,
                    gemini: 470,
                    llama: 460,
                    mistral: 460,
                    deepseek: 460,
                    qwen: 460,
                    cohere: 465,
                    grok: 460,
                },
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# File {}\ndef func{}(): pass", i, i)),
            });
        }

        repo
    }

    #[test]
    fn test_fixed_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks
            .iter()
            .all(|c| c.tokens <= 1000 || c.files.len() == 1));
    }

    #[test]
    fn test_file_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::File, 8000);
        let chunks = chunker.chunk(&repo);

        assert_eq!(chunks.len(), repo.files.len());
    }

    #[test]
    fn test_semantic_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Semantic, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        // All chunks should have correct total
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_symbol_chunking() {
        let mut repo = create_test_repo();
        if let Some(file) = repo.files.get_mut(0) {
            let mut symbol = Symbol::new("func0", SymbolKind::Function);
            symbol.start_line = 1;
            symbol.end_line = 1;
            symbol.visibility = Visibility::Public;
            file.symbols.push(symbol);
        }

        let chunker = Chunker::new(ChunkStrategy::Symbol, 500);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }
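
    // The Module and Dependency strategies had no coverage above. These are
    // minimal sketch tests on the same create_test_repo() fixture; since exact
    // chunk counts depend on the fixture's 500-token files, they only assert
    // structural invariants plus one ordering property.
    #[test]
    fn test_module_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Module, 8000);
        let chunks = chunker.chunk(&repo);

        // Every fixture file lives under "src", so grouping must keep them all
        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
        let file_count: usize = chunks.iter().map(|c| c.files.len()).sum();
        assert_eq!(file_count, repo.files.len());
    }

    #[test]
    fn test_dependency_chunking() {
        let mut repo = create_test_repo();
        // Make file1 import file0: resolve_import_paths("file0", ...) yields
        // the relative candidate "src/file0.py", which matches the fixture.
        if let Some(file) = repo.files.get_mut(1) {
            file.symbols.push(Symbol::new("file0", SymbolKind::Import));
        }

        let chunker = Chunker::new(ChunkStrategy::Dependency, 8000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        // The dependency (file0) must be emitted before its dependent (file1)
        let order: Vec<&str> = chunks
            .iter()
            .flat_map(|c| c.files.iter().map(|f| f.path.as_str()))
            .collect();
        let pos0 = order.iter().position(|p| *p == "src/file0.py").unwrap();
        let pos1 = order.iter().position(|p| *p == "src/file1.py").unwrap();
        assert!(pos0 < pos1);
    }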
}