infiniloom_engine/chunking/mod.rs

//! Intelligent code chunking for LLM context windows
//!
//! This module provides various strategies for splitting repositories into
//! chunks that fit within LLM context windows while preserving semantic coherence.
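//!
//! # Example
//!
//! A minimal sketch of typical usage. The crate path and the `repo` value are
//! assumptions for illustration; see `Repository` in `crate::types` for how a
//! repository is constructed.
//!
//! ```ignore
//! use infiniloom_engine::chunking::{ChunkStrategy, Chunker};
//!
//! // One chunk per file, with an 8,000-token budget and 100 tokens of overlap.
//! let chunker = Chunker::new(ChunkStrategy::File, 8_000).with_overlap(100);
//! let chunks = chunker.chunk(&repo);
//! ```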

mod strategies;
mod types;

use types::SymbolSnippet;
pub use types::{Chunk, ChunkContext, ChunkFile, ChunkStrategy, Chunker, CrossReference};

use crate::tokenizer::Tokenizer;
use crate::types::{RepoFile, Repository, SymbolKind, TokenizerModel};
use std::collections::{BTreeMap, HashMap, HashSet};

impl Chunker {
    /// Create a new chunker
    pub fn new(strategy: ChunkStrategy, max_tokens: u32) -> Self {
        Self { strategy, max_tokens, overlap_tokens: 200, model: TokenizerModel::Claude }
    }

    /// Set overlap tokens
    pub fn with_overlap(mut self, tokens: u32) -> Self {
        self.overlap_tokens = tokens;
        self
    }

    /// Set target model
    pub fn with_model(mut self, model: TokenizerModel) -> Self {
        self.model = model;
        self
    }

    /// Chunk a repository
    pub fn chunk(&self, repo: &Repository) -> Vec<Chunk> {
        match self.strategy {
            ChunkStrategy::Fixed { size } => self.fixed_chunk(repo, size),
            ChunkStrategy::File => self.file_chunk(repo),
            ChunkStrategy::Module => self.module_chunk(repo),
            ChunkStrategy::Symbol => self.symbol_chunk(repo),
            ChunkStrategy::Semantic => self.semantic_chunk(repo),
            ChunkStrategy::Dependency => self.dependency_chunk(repo),
        }
    }

    // =========================================================================
    // Chunk creation helpers
    // =========================================================================

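    /// Create a chunk from a slice of owned files.
    ///
    /// `total` is left at 0 here and filled in later by `finalize_chunks`.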
    pub(crate) fn create_chunk(&self, index: usize, files: &[RepoFile], tokens: u32) -> Chunk {
        let focus = self.determine_focus(files);

        Chunk {
            index,
            total: 0, // Updated in finalize
            focus: focus.clone(),
            tokens,
            files: files
                .iter()
                .map(|f| ChunkFile {
                    path: f.relative_path.clone(),
                    content: f.content.clone().unwrap_or_default(),
                    tokens: f.token_count.get(self.model),
                    truncated: false,
                })
                .collect(),
            context: ChunkContext {
                previous_summary: None,
                current_focus: focus,
                next_preview: None,
                cross_references: Vec::new(),
                overlap_content: None,
            },
        }
    }

    /// Create a chunk from file references (avoids cloning RepoFile)
    pub(crate) fn create_chunk_from_refs(
        &self,
        index: usize,
        files: &[&RepoFile],
        tokens: u32,
    ) -> Chunk {
        let focus = self.determine_focus_refs(files);

        Chunk {
            index,
            total: 0, // Updated in finalize
            focus: focus.clone(),
            tokens,
            files: files
                .iter()
                .map(|f| ChunkFile {
                    path: f.relative_path.clone(),
                    content: f.content.clone().unwrap_or_default(),
                    tokens: f.token_count.get(self.model),
                    truncated: false,
                })
                .collect(),
            context: ChunkContext {
                previous_summary: None,
                current_focus: focus,
                next_preview: None,
                cross_references: Vec::new(),
                overlap_content: None,
            },
        }
    }

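    /// Build a chunk from symbol snippets, grouping snippets by file and
    /// re-counting tokens on the concatenated per-file content.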
    pub(crate) fn build_symbol_chunk(
        &self,
        index: usize,
        snippets: &[SymbolSnippet],
        tokenizer: &Tokenizer,
    ) -> Chunk {
        let focus = self.determine_symbol_focus(snippets);
        let mut by_file: BTreeMap<&str, Vec<&SymbolSnippet>> = BTreeMap::new();

        for snippet in snippets {
            by_file
                .entry(snippet.file_path.as_str())
                .or_default()
                .push(snippet);
        }

        let mut files = Vec::new();
        let mut total_tokens = 0u32;

        for (path, mut entries) in by_file {
            entries.sort_by(|a, b| {
                a.start_line
                    .cmp(&b.start_line)
                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
            });

            let mut content = String::new();
            for entry in entries {
                if !content.is_empty() {
                    content.push_str("\n\n");
                }
                content.push_str(&entry.content);
            }

            let tokens = tokenizer.count(&content, self.model);
            total_tokens += tokens;

            files.push(ChunkFile { path: path.to_owned(), content, tokens, truncated: false });
        }

        Chunk {
            index,
            total: 0,
            focus: focus.clone(),
            tokens: total_tokens,
            files,
            context: ChunkContext {
                previous_summary: None,
                current_focus: focus,
                next_preview: None,
                cross_references: Vec::new(),
                overlap_content: None,
            },
        }
    }

    // =========================================================================
    // Focus determination
    // =========================================================================

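    /// Derive a short focus label for a set of files: a shared top-level path
    /// segment first, then a shared language, falling back to "Mixed content".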
    fn determine_focus(&self, files: &[RepoFile]) -> String {
        if files.is_empty() {
            return "Empty".to_owned();
        }

        // Try to find common directory
        let first_path = &files[0].relative_path;
        if let Some(module) = first_path.split('/').next() {
            if files.iter().all(|f| f.relative_path.starts_with(module)) {
                return format!("{} module", module);
            }
        }

        // Try to find common language
        if let Some(lang) = &files[0].language {
            if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
                return format!("{} files", lang);
            }
        }

        "Mixed content".to_owned()
    }

    /// Determine focus for file references (avoids requiring owned slice)
    fn determine_focus_refs(&self, files: &[&RepoFile]) -> String {
        if files.is_empty() {
            return "Empty".to_owned();
        }

        // Try to find common directory
        let first_path = &files[0].relative_path;
        if let Some(module) = first_path.split('/').next() {
            if files.iter().all(|f| f.relative_path.starts_with(module)) {
                return format!("{} module", module);
            }
        }

        // Try to find common language
        if let Some(lang) = &files[0].language {
            if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
                return format!("{} files", lang);
            }
        }

        "Mixed content".to_owned()
    }

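    /// Derive a focus label from up to three symbol names, noting how many more remain.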
    fn determine_symbol_focus(&self, snippets: &[SymbolSnippet]) -> String {
        if snippets.is_empty() {
            return "Symbols".to_owned();
        }

        let mut names: Vec<String> = snippets
            .iter()
            .take(3)
            .map(|snippet| snippet.symbol_name.clone())
            .collect();

        let suffix = if snippets.len() > names.len() {
            format!(" +{} more", snippets.len() - names.len())
        } else {
            String::new()
        };

        if names.len() == 1 {
            format!("Symbol: {}{}", names.remove(0), suffix)
        } else {
            format!("Symbols: {}{}", names.join(", "), suffix)
        }
    }

    // =========================================================================
    // Overlap and finalization
    // =========================================================================

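    /// Select files from the end of a chunk to carry forward as overlap context.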
    pub(crate) fn get_overlap_files(&self, files: &[RepoFile]) -> Vec<RepoFile> {
        // Keep files that might be needed for context
        // For now, just keep the last file if it's small enough
        files
            .last()
            .filter(|f| f.token_count.get(self.model) < self.overlap_tokens)
            .cloned()
            .into_iter()
            .collect()
    }

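    /// Final pass over the produced chunks: fill in totals, previous/next
    /// context, overlap content, and cross-references.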
    pub(crate) fn finalize_chunks(&self, mut chunks: Vec<Chunk>, repo: &Repository) -> Vec<Chunk> {
        let total = chunks.len();

        // First pass: collect the focus strings and overlap content we need
        let focus_strs: Vec<String> = chunks.iter().map(|c| c.focus.clone()).collect();

        // Extract overlap content from each chunk for the next one
        let overlap_contents: Vec<Option<String>> = if self.overlap_tokens > 0 {
            chunks
                .iter()
                .map(|chunk| self.extract_overlap_content(chunk))
                .collect()
        } else {
            vec![None; chunks.len()]
        };

        for (i, chunk) in chunks.iter_mut().enumerate() {
            chunk.total = total;

            // Add previous summary
            if i > 0 {
                chunk.context.previous_summary = Some(format!("Previous: {}", focus_strs[i - 1]));

                // Add overlap content from previous chunk
                if let Some(ref overlap) = overlap_contents[i - 1] {
                    chunk.context.overlap_content = Some(format!(
                        "<!-- [OVERLAP FROM PREVIOUS CHUNK] -->\n{}\n<!-- [END OVERLAP] -->",
                        overlap
                    ));
                }
            }

            // Add next preview
            if i + 1 < total {
                chunk.context.next_preview = Some(format!("Next: Chunk {}", i + 2));
            }
        }

        self.populate_cross_references(&mut chunks, repo);

        chunks
    }

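    /// Index every non-import symbol by the chunk and file it appears in, then
    /// record up to `MAX_REFS` cross-references per chunk to symbols that its
    /// files call, extend, or implement in other chunks.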
    fn populate_cross_references(&self, chunks: &mut [Chunk], repo: &Repository) {
        const MAX_REFS: usize = 25;

        #[derive(Clone)]
        struct SymbolLocation {
            chunk_index: usize,
            file: String,
        }

        let file_lookup: HashMap<&str, &RepoFile> = repo
            .files
            .iter()
            .map(|file| (file.relative_path.as_str(), file))
            .collect();

        let mut symbol_index: HashMap<String, Vec<SymbolLocation>> = HashMap::new();
        let mut seen_symbols: HashSet<(String, usize, String)> = HashSet::new();

        for (chunk_index, chunk) in chunks.iter().enumerate() {
            for chunk_file in &chunk.files {
                if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
                    for symbol in &repo_file.symbols {
                        if symbol.kind == SymbolKind::Import {
                            continue;
                        }
                        let key = (symbol.name.clone(), chunk_index, chunk_file.path.clone());
                        if seen_symbols.insert(key) {
                            symbol_index.entry(symbol.name.clone()).or_default().push(
                                SymbolLocation { chunk_index, file: chunk_file.path.clone() },
                            );
                        }
                    }
                }
            }
        }

        for (chunk_index, chunk) in chunks.iter_mut().enumerate() {
            let mut refs: Vec<CrossReference> = Vec::new();
            let mut seen_refs: HashSet<(String, usize, String)> = HashSet::new();

            'files: for chunk_file in &chunk.files {
                if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
                    for symbol in &repo_file.symbols {
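                        // Calls that resolve to symbols living in other chunks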
                        for called in &symbol.calls {
                            if let Some(targets) = symbol_index.get(called) {
                                for target in targets {
                                    if target.chunk_index == chunk_index {
                                        continue;
                                    }
                                    let key = (
                                        called.to_owned(),
                                        target.chunk_index,
                                        target.file.clone(),
                                    );
                                    if seen_refs.insert(key) {
                                        refs.push(CrossReference {
                                            symbol: called.to_owned(),
                                            chunk_index: target.chunk_index,
                                            file: target.file.clone(),
                                        });
                                        if refs.len() >= MAX_REFS {
                                            break 'files;
                                        }
                                    }
                                }
                            }
                        }

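                        // Base type (extends) defined in another chunk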
                        if let Some(ref base) = symbol.extends {
                            if let Some(targets) = symbol_index.get(base) {
                                for target in targets {
                                    if target.chunk_index == chunk_index {
                                        continue;
                                    }
                                    let key =
                                        (base.to_owned(), target.chunk_index, target.file.clone());
                                    if seen_refs.insert(key) {
                                        refs.push(CrossReference {
                                            symbol: base.to_owned(),
                                            chunk_index: target.chunk_index,
                                            file: target.file.clone(),
                                        });
                                        if refs.len() >= MAX_REFS {
                                            break 'files;
                                        }
                                    }
                                }
                            }
                        }

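                        // Implemented interfaces defined in other chunks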
                        for iface in &symbol.implements {
                            if let Some(targets) = symbol_index.get(iface) {
                                for target in targets {
                                    if target.chunk_index == chunk_index {
                                        continue;
                                    }
                                    let key =
                                        (iface.to_owned(), target.chunk_index, target.file.clone());
                                    if seen_refs.insert(key) {
                                        refs.push(CrossReference {
                                            symbol: iface.to_owned(),
                                            chunk_index: target.chunk_index,
                                            file: target.file.clone(),
                                        });
                                        if refs.len() >= MAX_REFS {
                                            break 'files;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }

            refs.sort_by(|a, b| {
                a.chunk_index
                    .cmp(&b.chunk_index)
                    .then_with(|| a.symbol.cmp(&b.symbol))
                    .then_with(|| a.file.cmp(&b.file))
            });
            if refs.len() > MAX_REFS {
                refs.truncate(MAX_REFS);
            }

            chunk.context.cross_references = refs;
        }
    }

    /// Extract content from the end of a chunk for overlap
    fn extract_overlap_content(&self, chunk: &Chunk) -> Option<String> {
        if self.overlap_tokens == 0 || chunk.files.is_empty() {
            return None;
        }

        let tokenizer = Tokenizer::new();
        let mut overlap_parts = Vec::new();
        let mut remaining_tokens = self.overlap_tokens;
        let token_model = self.model;

        // Take content from the last files until we've accumulated enough tokens
        for file in chunk.files.iter().rev() {
            if remaining_tokens == 0 {
                break;
            }

            let file_tokens = tokenizer.count(&file.content, token_model);
            if file_tokens <= remaining_tokens {
                // Include entire file
                overlap_parts.push(format!("// From: {}\n{}", file.path, file.content));
                remaining_tokens = remaining_tokens.saturating_sub(file_tokens);
            } else {
                // Include partial file (last N lines that fit)
                let lines: Vec<&str> = file.content.lines().collect();
                let mut partial_lines = Vec::new();
                let mut partial_tokens = 0u32;

                for line in lines.iter().rev() {
                    let line_tokens = tokenizer.count(line, token_model);
                    if partial_tokens + line_tokens > remaining_tokens {
                        break;
                    }
                    partial_lines.push(*line);
                    partial_tokens += line_tokens;
                }

                if !partial_lines.is_empty() {
                    partial_lines.reverse();
                    let partial_content = partial_lines.join("\n");
                    overlap_parts
                        .push(format!("// From: {} (partial)\n{}", file.path, partial_content));
                }
                remaining_tokens = 0;
            }
        }

        if overlap_parts.is_empty() {
            None
        } else {
            overlap_parts.reverse();
            Some(overlap_parts.join("\n\n"))
        }
    }
}

#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use crate::types::{Symbol, SymbolKind, TokenCounts, Visibility};

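    /// Build a small in-memory repository of five Python files with fixed token counts.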
    fn create_test_repo() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        for i in 0..5 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/src/file{}.py", i).into(),
                relative_path: format!("src/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 1000,
                token_count: TokenCounts {
                    o200k: 480,
                    cl100k: 490,
                    claude: 500,
                    gemini: 470,
                    llama: 460,
                    mistral: 460,
                    deepseek: 460,
                    qwen: 460,
                    cohere: 465,
                    grok: 460,
                },
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# File {}\ndef func{}(): pass", i, i)),
            });
        }

        repo
    }

    #[test]
    fn test_fixed_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks
            .iter()
            .all(|c| c.tokens <= 1000 || c.files.len() == 1));
    }

    #[test]
    fn test_file_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::File, 8000);
        let chunks = chunker.chunk(&repo);

        assert_eq!(chunks.len(), repo.files.len());
    }

    #[test]
    fn test_semantic_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Semantic, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        // All chunks should have correct total
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_symbol_chunking() {
        let mut repo = create_test_repo();
        if let Some(file) = repo.files.get_mut(0) {
            let mut symbol = Symbol::new("func0", SymbolKind::Function);
            symbol.start_line = 1;
            symbol.end_line = 1;
            symbol.visibility = Visibility::Public;
            file.symbols.push(symbol);
        }

        let chunker = Chunker::new(ChunkStrategy::Symbol, 500);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }
}