// infiniloom_engine/chunking/mod.rs

1//! Intelligent code chunking for LLM context windows
2//!
3//! This module provides various strategies for splitting repositories into
4//! chunks that fit within LLM context windows while preserving semantic coherence.
5
6mod strategies;
7mod types;
8
9pub use types::{Chunk, ChunkContext, ChunkFile, ChunkStrategy, Chunker, CrossReference};
10use types::SymbolSnippet;
11
12use crate::tokenizer::Tokenizer;
13use crate::types::{RepoFile, Repository, SymbolKind, TokenizerModel};
14use std::collections::{BTreeMap, HashMap, HashSet};
15
16impl Chunker {
17    /// Create a new chunker
18    pub fn new(strategy: ChunkStrategy, max_tokens: u32) -> Self {
19        Self { strategy, max_tokens, overlap_tokens: 200, model: TokenizerModel::Claude }
20    }
21
22    /// Set overlap tokens
23    pub fn with_overlap(mut self, tokens: u32) -> Self {
24        self.overlap_tokens = tokens;
25        self
26    }
27
28    /// Set target model
29    pub fn with_model(mut self, model: TokenizerModel) -> Self {
30        self.model = model;
31        self
32    }
33
34    /// Chunk a repository
35    pub fn chunk(&self, repo: &Repository) -> Vec<Chunk> {
36        match self.strategy {
37            ChunkStrategy::Fixed { size } => self.fixed_chunk(repo, size),
38            ChunkStrategy::File => self.file_chunk(repo),
39            ChunkStrategy::Module => self.module_chunk(repo),
40            ChunkStrategy::Symbol => self.symbol_chunk(repo),
41            ChunkStrategy::Semantic => self.semantic_chunk(repo),
42            ChunkStrategy::Dependency => self.dependency_chunk(repo),
43        }
44    }
45
46    // =========================================================================
47    // Chunk creation helpers
48    // =========================================================================
49
50    pub(crate) fn create_chunk(&self, index: usize, files: &[RepoFile], tokens: u32) -> Chunk {
51        let focus = self.determine_focus(files);
52
53        Chunk {
54            index,
55            total: 0, // Updated in finalize
56            focus: focus.clone(),
57            tokens,
58            files: files
59                .iter()
60                .map(|f| ChunkFile {
61                    path: f.relative_path.clone(),
62                    content: f.content.clone().unwrap_or_default(),
63                    tokens: f.token_count.get(self.model),
64                    truncated: false,
65                })
66                .collect(),
67            context: ChunkContext {
68                previous_summary: None,
69                current_focus: focus,
70                next_preview: None,
71                cross_references: Vec::new(),
72                overlap_content: None,
73            },
74        }
75    }
76
77    /// Create a chunk from file references (avoids cloning RepoFile)
78    pub(crate) fn create_chunk_from_refs(&self, index: usize, files: &[&RepoFile], tokens: u32) -> Chunk {
79        let focus = self.determine_focus_refs(files);
80
81        Chunk {
82            index,
83            total: 0, // Updated in finalize
84            focus: focus.clone(),
85            tokens,
86            files: files
87                .iter()
88                .map(|f| ChunkFile {
89                    path: f.relative_path.clone(),
90                    content: f.content.clone().unwrap_or_default(),
91                    tokens: f.token_count.get(self.model),
92                    truncated: false,
93                })
94                .collect(),
95            context: ChunkContext {
96                previous_summary: None,
97                current_focus: focus,
98                next_preview: None,
99                cross_references: Vec::new(),
100                overlap_content: None,
101            },
102        }
103    }
104
105    pub(crate) fn build_symbol_chunk(
106        &self,
107        index: usize,
108        snippets: &[SymbolSnippet],
109        tokenizer: &Tokenizer,
110    ) -> Chunk {
111        let focus = self.determine_symbol_focus(snippets);
112        let mut by_file: BTreeMap<&str, Vec<&SymbolSnippet>> = BTreeMap::new();
113
114        for snippet in snippets {
115            by_file
116                .entry(snippet.file_path.as_str())
117                .or_default()
118                .push(snippet);
119        }
120
121        let mut files = Vec::new();
122        let mut total_tokens = 0u32;
123
124        for (path, mut entries) in by_file {
125            entries.sort_by(|a, b| {
126                a.start_line
127                    .cmp(&b.start_line)
128                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
129            });
130
131            let mut content = String::new();
132            for entry in entries {
133                if !content.is_empty() {
134                    content.push_str("\n\n");
135                }
136                content.push_str(&entry.content);
137            }
138
139            let tokens = tokenizer.count(&content, self.model);
140            total_tokens += tokens;
141
142            files.push(ChunkFile { path: path.to_owned(), content, tokens, truncated: false });
143        }
144
145        Chunk {
146            index,
147            total: 0,
148            focus: focus.clone(),
149            tokens: total_tokens,
150            files,
151            context: ChunkContext {
152                previous_summary: None,
153                current_focus: focus,
154                next_preview: None,
155                cross_references: Vec::new(),
156                overlap_content: None,
157            },
158        }
159    }
160
161    // =========================================================================
162    // Focus determination
163    // =========================================================================
164
165    fn determine_focus(&self, files: &[RepoFile]) -> String {
166        if files.is_empty() {
167            return "Empty".to_owned();
168        }
169
170        // Try to find common directory
171        let first_path = &files[0].relative_path;
172        if let Some(module) = first_path.split('/').next() {
173            if files.iter().all(|f| f.relative_path.starts_with(module)) {
174                return format!("{} module", module);
175            }
176        }
177
178        // Try to find common language
179        if let Some(lang) = &files[0].language {
180            if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
181                return format!("{} files", lang);
182            }
183        }
184
185        "Mixed content".to_owned()
186    }
187
188    /// Determine focus for file references (avoids requiring owned slice)
189    fn determine_focus_refs(&self, files: &[&RepoFile]) -> String {
190        if files.is_empty() {
191            return "Empty".to_owned();
192        }
193
194        // Try to find common directory
195        let first_path = &files[0].relative_path;
196        if let Some(module) = first_path.split('/').next() {
197            if files.iter().all(|f| f.relative_path.starts_with(module)) {
198                return format!("{} module", module);
199            }
200        }
201
202        // Try to find common language
203        if let Some(lang) = &files[0].language {
204            if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
205                return format!("{} files", lang);
206            }
207        }
208
209        "Mixed content".to_owned()
210    }
211
212    fn determine_symbol_focus(&self, snippets: &[SymbolSnippet]) -> String {
213        if snippets.is_empty() {
214            return "Symbols".to_owned();
215        }
216
217        let mut names: Vec<String> = snippets
218            .iter()
219            .take(3)
220            .map(|snippet| snippet.symbol_name.clone())
221            .collect();
222
223        let suffix = if snippets.len() > names.len() {
224            format!(" +{} more", snippets.len() - names.len())
225        } else {
226            String::new()
227        };
228
229        if names.len() == 1 {
230            format!("Symbol: {}{}", names.remove(0), suffix)
231        } else {
232            format!("Symbols: {}{}", names.join(", "), suffix)
233        }
234    }
235
236    // =========================================================================
237    // Overlap and finalization
238    // =========================================================================
239
240    pub(crate) fn get_overlap_files(&self, files: &[RepoFile]) -> Vec<RepoFile> {
241        // Keep files that might be needed for context
242        // For now, just keep the last file if it's small enough
243        files
244            .last()
245            .filter(|f| f.token_count.get(self.model) < self.overlap_tokens)
246            .cloned()
247            .into_iter()
248            .collect()
249    }
250
251    pub(crate) fn finalize_chunks(&self, mut chunks: Vec<Chunk>, repo: &Repository) -> Vec<Chunk> {
252        let total = chunks.len();
253
254        // First pass: collect the focus strings and overlap content we need
255        let focus_strs: Vec<String> = chunks.iter().map(|c| c.focus.clone()).collect();
256
257        // Extract overlap content from each chunk for the next one
258        let overlap_contents: Vec<Option<String>> = if self.overlap_tokens > 0 {
259            chunks
260                .iter()
261                .map(|chunk| self.extract_overlap_content(chunk))
262                .collect()
263        } else {
264            vec![None; chunks.len()]
265        };
266
267        for (i, chunk) in chunks.iter_mut().enumerate() {
268            chunk.total = total;
269
270            // Add previous summary
271            if i > 0 {
272                chunk.context.previous_summary = Some(format!("Previous: {}", focus_strs[i - 1]));
273
274                // Add overlap content from previous chunk
275                if let Some(ref overlap) = overlap_contents[i - 1] {
276                    chunk.context.overlap_content = Some(format!(
277                        "<!-- [OVERLAP FROM PREVIOUS CHUNK] -->\n{}\n<!-- [END OVERLAP] -->",
278                        overlap
279                    ));
280                }
281            }
282
283            // Add next preview
284            if i + 1 < total {
285                chunk.context.next_preview = Some(format!("Next: Chunk {}", i + 2));
286            }
287        }
288
289        self.populate_cross_references(&mut chunks, repo);
290
291        chunks
292    }
293
294    fn populate_cross_references(&self, chunks: &mut [Chunk], repo: &Repository) {
295        const MAX_REFS: usize = 25;
296
297        #[derive(Clone)]
298        struct SymbolLocation {
299            chunk_index: usize,
300            file: String,
301        }
302
303        let file_lookup: HashMap<&str, &RepoFile> = repo
304            .files
305            .iter()
306            .map(|file| (file.relative_path.as_str(), file))
307            .collect();
308
309        let mut symbol_index: HashMap<String, Vec<SymbolLocation>> = HashMap::new();
310        let mut seen_symbols: HashSet<(String, usize, String)> = HashSet::new();
311
312        for (chunk_index, chunk) in chunks.iter().enumerate() {
313            for chunk_file in &chunk.files {
314                if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
315                    for symbol in &repo_file.symbols {
316                        if symbol.kind == SymbolKind::Import {
317                            continue;
318                        }
319                        let key = (symbol.name.clone(), chunk_index, chunk_file.path.clone());
320                        if seen_symbols.insert(key) {
321                            symbol_index.entry(symbol.name.clone()).or_default().push(
322                                SymbolLocation { chunk_index, file: chunk_file.path.clone() },
323                            );
324                        }
325                    }
326                }
327            }
328        }
329
330        for (chunk_index, chunk) in chunks.iter_mut().enumerate() {
331            let mut refs: Vec<CrossReference> = Vec::new();
332            let mut seen_refs: HashSet<(String, usize, String)> = HashSet::new();
333
334            'files: for chunk_file in &chunk.files {
335                if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
336                    for symbol in &repo_file.symbols {
337                        for called in &symbol.calls {
338                            if let Some(targets) = symbol_index.get(called) {
339                                for target in targets {
340                                    if target.chunk_index == chunk_index {
341                                        continue;
342                                    }
343                                    let key = (
344                                        called.to_owned(),
345                                        target.chunk_index,
346                                        target.file.clone(),
347                                    );
348                                    if seen_refs.insert(key) {
349                                        refs.push(CrossReference {
350                                            symbol: called.to_owned(),
351                                            chunk_index: target.chunk_index,
352                                            file: target.file.clone(),
353                                        });
354                                        if refs.len() >= MAX_REFS {
355                                            break 'files;
356                                        }
357                                    }
358                                }
359                            }
360                        }
361
362                        if let Some(ref base) = symbol.extends {
363                            if let Some(targets) = symbol_index.get(base) {
364                                for target in targets {
365                                    if target.chunk_index == chunk_index {
366                                        continue;
367                                    }
368                                    let key =
369                                        (base.to_owned(), target.chunk_index, target.file.clone());
370                                    if seen_refs.insert(key) {
371                                        refs.push(CrossReference {
372                                            symbol: base.to_owned(),
373                                            chunk_index: target.chunk_index,
374                                            file: target.file.clone(),
375                                        });
376                                        if refs.len() >= MAX_REFS {
377                                            break 'files;
378                                        }
379                                    }
380                                }
381                            }
382                        }
383
384                        for iface in &symbol.implements {
385                            if let Some(targets) = symbol_index.get(iface) {
386                                for target in targets {
387                                    if target.chunk_index == chunk_index {
388                                        continue;
389                                    }
390                                    let key =
391                                        (iface.to_owned(), target.chunk_index, target.file.clone());
392                                    if seen_refs.insert(key) {
393                                        refs.push(CrossReference {
394                                            symbol: iface.to_owned(),
395                                            chunk_index: target.chunk_index,
396                                            file: target.file.clone(),
397                                        });
398                                        if refs.len() >= MAX_REFS {
399                                            break 'files;
400                                        }
401                                    }
402                                }
403                            }
404                        }
405                    }
406                }
407            }
408
409            refs.sort_by(|a, b| {
410                a.chunk_index
411                    .cmp(&b.chunk_index)
412                    .then_with(|| a.symbol.cmp(&b.symbol))
413                    .then_with(|| a.file.cmp(&b.file))
414            });
415            if refs.len() > MAX_REFS {
416                refs.truncate(MAX_REFS);
417            }
418
419            chunk.context.cross_references = refs;
420        }
421    }
422
423    /// Extract content from the end of a chunk for overlap
424    fn extract_overlap_content(&self, chunk: &Chunk) -> Option<String> {
425        if self.overlap_tokens == 0 || chunk.files.is_empty() {
426            return None;
427        }
428
429        let tokenizer = Tokenizer::new();
430        let mut overlap_parts = Vec::new();
431        let mut remaining_tokens = self.overlap_tokens;
432        let token_model = self.model;
433
434        // Take content from the last files until we've accumulated enough tokens
435        for file in chunk.files.iter().rev() {
436            if remaining_tokens == 0 {
437                break;
438            }
439
440            let file_tokens = tokenizer.count(&file.content, token_model);
441            if file_tokens <= remaining_tokens {
442                // Include entire file
443                overlap_parts.push(format!("// From: {}\n{}", file.path, file.content));
444                remaining_tokens = remaining_tokens.saturating_sub(file_tokens);
445            } else {
446                // Include partial file (last N lines that fit)
447                let lines: Vec<&str> = file.content.lines().collect();
448                let mut partial_lines = Vec::new();
449                let mut partial_tokens = 0u32;
450
451                for line in lines.iter().rev() {
452                    let line_tokens = tokenizer.count(line, token_model);
453                    if partial_tokens + line_tokens > remaining_tokens {
454                        break;
455                    }
456                    partial_lines.push(*line);
457                    partial_tokens += line_tokens;
458                }
459
460                if !partial_lines.is_empty() {
461                    partial_lines.reverse();
462                    let partial_content = partial_lines.join("\n");
463                    overlap_parts
464                        .push(format!("// From: {} (partial)\n{}", file.path, partial_content));
465                }
466                remaining_tokens = 0;
467            }
468        }
469
470        if overlap_parts.is_empty() {
471            None
472        } else {
473            overlap_parts.reverse();
474            Some(overlap_parts.join("\n\n"))
475        }
476    }
477}
478
#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use crate::types::{Symbol, SymbolKind, TokenCounts, Visibility};

    /// Build a small in-memory repository: five Python files under `src/`,
    /// each with fixed per-model token counts (claude = 500) and a two-line
    /// stub body. No symbols are attached; tests add them as needed.
    fn create_test_repo() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        for i in 0..5 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/src/file{}.py", i).into(),
                relative_path: format!("src/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 1000,
                token_count: TokenCounts {
                    o200k: 480,
                    cl100k: 490,
                    claude: 500,
                    gemini: 470,
                    llama: 460,
                    mistral: 460,
                    deepseek: 460,
                    qwen: 460,
                    cohere: 465,
                    grok: 460,
                },
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# File {}\ndef func{}(): pass", i, i)),
            });
        }

        repo
    }

    /// Fixed-size chunking: every chunk fits the budget unless a single
    /// oversized file forced a one-file chunk.
    #[test]
    fn test_fixed_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks
            .iter()
            .all(|c| c.tokens <= 1000 || c.files.len() == 1));
    }

    /// File strategy: exactly one chunk per repository file.
    #[test]
    fn test_file_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::File, 8000);
        let chunks = chunker.chunk(&repo);

        assert_eq!(chunks.len(), repo.files.len());
    }

    /// Semantic strategy: finalization stamps each chunk with the total count.
    #[test]
    fn test_semantic_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Semantic, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        // All chunks should have correct total
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    /// Symbol strategy: attach one public function symbol to the first file
    /// and verify chunking still produces finalized chunks.
    #[test]
    fn test_symbol_chunking() {
        let mut repo = create_test_repo();
        if let Some(file) = repo.files.get_mut(0) {
            let mut symbol = Symbol::new("func0", SymbolKind::Function);
            symbol.start_line = 1;
            symbol.end_line = 1;
            symbol.visibility = Visibility::Public;
            file.symbols.push(symbol);
        }

        let chunker = Chunker::new(ChunkStrategy::Symbol, 500);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }
}