1mod strategies;
7mod types;
8
9pub use types::{Chunk, ChunkContext, ChunkFile, ChunkStrategy, Chunker, CrossReference};
10use types::SymbolSnippet;
11
12use crate::tokenizer::Tokenizer;
13use crate::types::{RepoFile, Repository, SymbolKind, TokenizerModel};
14use std::collections::{BTreeMap, HashMap, HashSet};
15
16impl Chunker {
17 pub fn new(strategy: ChunkStrategy, max_tokens: u32) -> Self {
19 Self { strategy, max_tokens, overlap_tokens: 200, model: TokenizerModel::Claude }
20 }
21
22 pub fn with_overlap(mut self, tokens: u32) -> Self {
24 self.overlap_tokens = tokens;
25 self
26 }
27
28 pub fn with_model(mut self, model: TokenizerModel) -> Self {
30 self.model = model;
31 self
32 }
33
34 pub fn chunk(&self, repo: &Repository) -> Vec<Chunk> {
36 match self.strategy {
37 ChunkStrategy::Fixed { size } => self.fixed_chunk(repo, size),
38 ChunkStrategy::File => self.file_chunk(repo),
39 ChunkStrategy::Module => self.module_chunk(repo),
40 ChunkStrategy::Symbol => self.symbol_chunk(repo),
41 ChunkStrategy::Semantic => self.semantic_chunk(repo),
42 ChunkStrategy::Dependency => self.dependency_chunk(repo),
43 }
44 }
45
46 pub(crate) fn create_chunk(&self, index: usize, files: &[RepoFile], tokens: u32) -> Chunk {
51 let focus = self.determine_focus(files);
52
53 Chunk {
54 index,
55 total: 0, focus: focus.clone(),
57 tokens,
58 files: files
59 .iter()
60 .map(|f| ChunkFile {
61 path: f.relative_path.clone(),
62 content: f.content.clone().unwrap_or_default(),
63 tokens: f.token_count.get(self.model),
64 truncated: false,
65 })
66 .collect(),
67 context: ChunkContext {
68 previous_summary: None,
69 current_focus: focus,
70 next_preview: None,
71 cross_references: Vec::new(),
72 overlap_content: None,
73 },
74 }
75 }
76
77 pub(crate) fn create_chunk_from_refs(&self, index: usize, files: &[&RepoFile], tokens: u32) -> Chunk {
79 let focus = self.determine_focus_refs(files);
80
81 Chunk {
82 index,
83 total: 0, focus: focus.clone(),
85 tokens,
86 files: files
87 .iter()
88 .map(|f| ChunkFile {
89 path: f.relative_path.clone(),
90 content: f.content.clone().unwrap_or_default(),
91 tokens: f.token_count.get(self.model),
92 truncated: false,
93 })
94 .collect(),
95 context: ChunkContext {
96 previous_summary: None,
97 current_focus: focus,
98 next_preview: None,
99 cross_references: Vec::new(),
100 overlap_content: None,
101 },
102 }
103 }
104
105 pub(crate) fn build_symbol_chunk(
106 &self,
107 index: usize,
108 snippets: &[SymbolSnippet],
109 tokenizer: &Tokenizer,
110 ) -> Chunk {
111 let focus = self.determine_symbol_focus(snippets);
112 let mut by_file: BTreeMap<&str, Vec<&SymbolSnippet>> = BTreeMap::new();
113
114 for snippet in snippets {
115 by_file
116 .entry(snippet.file_path.as_str())
117 .or_default()
118 .push(snippet);
119 }
120
121 let mut files = Vec::new();
122 let mut total_tokens = 0u32;
123
124 for (path, mut entries) in by_file {
125 entries.sort_by(|a, b| {
126 a.start_line
127 .cmp(&b.start_line)
128 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
129 });
130
131 let mut content = String::new();
132 for entry in entries {
133 if !content.is_empty() {
134 content.push_str("\n\n");
135 }
136 content.push_str(&entry.content);
137 }
138
139 let tokens = tokenizer.count(&content, self.model);
140 total_tokens += tokens;
141
142 files.push(ChunkFile { path: path.to_owned(), content, tokens, truncated: false });
143 }
144
145 Chunk {
146 index,
147 total: 0,
148 focus: focus.clone(),
149 tokens: total_tokens,
150 files,
151 context: ChunkContext {
152 previous_summary: None,
153 current_focus: focus,
154 next_preview: None,
155 cross_references: Vec::new(),
156 overlap_content: None,
157 },
158 }
159 }
160
161 fn determine_focus(&self, files: &[RepoFile]) -> String {
166 if files.is_empty() {
167 return "Empty".to_owned();
168 }
169
170 let first_path = &files[0].relative_path;
172 if let Some(module) = first_path.split('/').next() {
173 if files.iter().all(|f| f.relative_path.starts_with(module)) {
174 return format!("{} module", module);
175 }
176 }
177
178 if let Some(lang) = &files[0].language {
180 if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
181 return format!("{} files", lang);
182 }
183 }
184
185 "Mixed content".to_owned()
186 }
187
188 fn determine_focus_refs(&self, files: &[&RepoFile]) -> String {
190 if files.is_empty() {
191 return "Empty".to_owned();
192 }
193
194 let first_path = &files[0].relative_path;
196 if let Some(module) = first_path.split('/').next() {
197 if files.iter().all(|f| f.relative_path.starts_with(module)) {
198 return format!("{} module", module);
199 }
200 }
201
202 if let Some(lang) = &files[0].language {
204 if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
205 return format!("{} files", lang);
206 }
207 }
208
209 "Mixed content".to_owned()
210 }
211
212 fn determine_symbol_focus(&self, snippets: &[SymbolSnippet]) -> String {
213 if snippets.is_empty() {
214 return "Symbols".to_owned();
215 }
216
217 let mut names: Vec<String> = snippets
218 .iter()
219 .take(3)
220 .map(|snippet| snippet.symbol_name.clone())
221 .collect();
222
223 let suffix = if snippets.len() > names.len() {
224 format!(" +{} more", snippets.len() - names.len())
225 } else {
226 String::new()
227 };
228
229 if names.len() == 1 {
230 format!("Symbol: {}{}", names.remove(0), suffix)
231 } else {
232 format!("Symbols: {}{}", names.join(", "), suffix)
233 }
234 }
235
236 pub(crate) fn get_overlap_files(&self, files: &[RepoFile]) -> Vec<RepoFile> {
241 files
244 .last()
245 .filter(|f| f.token_count.get(self.model) < self.overlap_tokens)
246 .cloned()
247 .into_iter()
248 .collect()
249 }
250
251 pub(crate) fn finalize_chunks(&self, mut chunks: Vec<Chunk>, repo: &Repository) -> Vec<Chunk> {
252 let total = chunks.len();
253
254 let focus_strs: Vec<String> = chunks.iter().map(|c| c.focus.clone()).collect();
256
257 let overlap_contents: Vec<Option<String>> = if self.overlap_tokens > 0 {
259 chunks
260 .iter()
261 .map(|chunk| self.extract_overlap_content(chunk))
262 .collect()
263 } else {
264 vec![None; chunks.len()]
265 };
266
267 for (i, chunk) in chunks.iter_mut().enumerate() {
268 chunk.total = total;
269
270 if i > 0 {
272 chunk.context.previous_summary = Some(format!("Previous: {}", focus_strs[i - 1]));
273
274 if let Some(ref overlap) = overlap_contents[i - 1] {
276 chunk.context.overlap_content = Some(format!(
277 "<!-- [OVERLAP FROM PREVIOUS CHUNK] -->\n{}\n<!-- [END OVERLAP] -->",
278 overlap
279 ));
280 }
281 }
282
283 if i + 1 < total {
285 chunk.context.next_preview = Some(format!("Next: Chunk {}", i + 2));
286 }
287 }
288
289 self.populate_cross_references(&mut chunks, repo);
290
291 chunks
292 }
293
294 fn populate_cross_references(&self, chunks: &mut [Chunk], repo: &Repository) {
295 const MAX_REFS: usize = 25;
296
297 #[derive(Clone)]
298 struct SymbolLocation {
299 chunk_index: usize,
300 file: String,
301 }
302
303 let file_lookup: HashMap<&str, &RepoFile> = repo
304 .files
305 .iter()
306 .map(|file| (file.relative_path.as_str(), file))
307 .collect();
308
309 let mut symbol_index: HashMap<String, Vec<SymbolLocation>> = HashMap::new();
310 let mut seen_symbols: HashSet<(String, usize, String)> = HashSet::new();
311
312 for (chunk_index, chunk) in chunks.iter().enumerate() {
313 for chunk_file in &chunk.files {
314 if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
315 for symbol in &repo_file.symbols {
316 if symbol.kind == SymbolKind::Import {
317 continue;
318 }
319 let key = (symbol.name.clone(), chunk_index, chunk_file.path.clone());
320 if seen_symbols.insert(key) {
321 symbol_index.entry(symbol.name.clone()).or_default().push(
322 SymbolLocation { chunk_index, file: chunk_file.path.clone() },
323 );
324 }
325 }
326 }
327 }
328 }
329
330 for (chunk_index, chunk) in chunks.iter_mut().enumerate() {
331 let mut refs: Vec<CrossReference> = Vec::new();
332 let mut seen_refs: HashSet<(String, usize, String)> = HashSet::new();
333
334 'files: for chunk_file in &chunk.files {
335 if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
336 for symbol in &repo_file.symbols {
337 for called in &symbol.calls {
338 if let Some(targets) = symbol_index.get(called) {
339 for target in targets {
340 if target.chunk_index == chunk_index {
341 continue;
342 }
343 let key = (
344 called.to_owned(),
345 target.chunk_index,
346 target.file.clone(),
347 );
348 if seen_refs.insert(key) {
349 refs.push(CrossReference {
350 symbol: called.to_owned(),
351 chunk_index: target.chunk_index,
352 file: target.file.clone(),
353 });
354 if refs.len() >= MAX_REFS {
355 break 'files;
356 }
357 }
358 }
359 }
360 }
361
362 if let Some(ref base) = symbol.extends {
363 if let Some(targets) = symbol_index.get(base) {
364 for target in targets {
365 if target.chunk_index == chunk_index {
366 continue;
367 }
368 let key =
369 (base.to_owned(), target.chunk_index, target.file.clone());
370 if seen_refs.insert(key) {
371 refs.push(CrossReference {
372 symbol: base.to_owned(),
373 chunk_index: target.chunk_index,
374 file: target.file.clone(),
375 });
376 if refs.len() >= MAX_REFS {
377 break 'files;
378 }
379 }
380 }
381 }
382 }
383
384 for iface in &symbol.implements {
385 if let Some(targets) = symbol_index.get(iface) {
386 for target in targets {
387 if target.chunk_index == chunk_index {
388 continue;
389 }
390 let key =
391 (iface.to_owned(), target.chunk_index, target.file.clone());
392 if seen_refs.insert(key) {
393 refs.push(CrossReference {
394 symbol: iface.to_owned(),
395 chunk_index: target.chunk_index,
396 file: target.file.clone(),
397 });
398 if refs.len() >= MAX_REFS {
399 break 'files;
400 }
401 }
402 }
403 }
404 }
405 }
406 }
407 }
408
409 refs.sort_by(|a, b| {
410 a.chunk_index
411 .cmp(&b.chunk_index)
412 .then_with(|| a.symbol.cmp(&b.symbol))
413 .then_with(|| a.file.cmp(&b.file))
414 });
415 if refs.len() > MAX_REFS {
416 refs.truncate(MAX_REFS);
417 }
418
419 chunk.context.cross_references = refs;
420 }
421 }
422
    /// Collects up to `overlap_tokens` worth of content from the *end* of
    /// `chunk`, walking its files back-to-front, so the text can be repeated
    /// at the start of the next chunk for continuity.
    ///
    /// Returns `None` when overlap is disabled, the chunk has no files, or
    /// nothing fits within the budget.
    fn extract_overlap_content(&self, chunk: &Chunk) -> Option<String> {
        if self.overlap_tokens == 0 || chunk.files.is_empty() {
            return None;
        }

        // NOTE(review): a fresh Tokenizer is built on every call; hoisting it
        // to the caller would need a signature change — left as-is.
        let tokenizer = Tokenizer::new();
        let mut overlap_parts = Vec::new();
        let mut remaining_tokens = self.overlap_tokens;
        let token_model = self.model;

        // Walk files from last to first, consuming the token budget.
        for file in chunk.files.iter().rev() {
            if remaining_tokens == 0 {
                break;
            }

            let file_tokens = tokenizer.count(&file.content, token_model);
            if file_tokens <= remaining_tokens {
                // Whole file fits within the remaining budget.
                overlap_parts.push(format!("// From: {}\n{}", file.path, file.content));
                remaining_tokens = remaining_tokens.saturating_sub(file_tokens);
            } else {
                // File is too large: take as many *trailing* lines as fit,
                // then stop (budget is considered spent either way).
                let lines: Vec<&str> = file.content.lines().collect();
                let mut partial_lines = Vec::new();
                let mut partial_tokens = 0u32;

                for line in lines.iter().rev() {
                    let line_tokens = tokenizer.count(line, token_model);
                    if partial_tokens + line_tokens > remaining_tokens {
                        break;
                    }
                    partial_lines.push(*line);
                    partial_tokens += line_tokens;
                }

                if !partial_lines.is_empty() {
                    // Lines were gathered in reverse; restore original order.
                    partial_lines.reverse();
                    let partial_content = partial_lines.join("\n");
                    overlap_parts
                        .push(format!("// From: {} (partial)\n{}", file.path, partial_content));
                }
                remaining_tokens = 0;
            }
        }

        if overlap_parts.is_empty() {
            None
        } else {
            // Parts were collected back-to-front; restore file order.
            overlap_parts.reverse();
            Some(overlap_parts.join("\n\n"))
        }
    }
477}
478
#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use crate::types::{Symbol, SymbolKind, TokenCounts, Visibility};

    /// Builds a small in-memory repository of five Python files with fixed
    /// per-model token counts.
    fn create_test_repo() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        repo.files.extend((0..5).map(|i| RepoFile {
            path: format!("/tmp/test/src/file{}.py", i).into(),
            relative_path: format!("src/file{}.py", i),
            language: Some("python".to_string()),
            size_bytes: 1000,
            token_count: TokenCounts {
                o200k: 480,
                cl100k: 490,
                claude: 500,
                gemini: 470,
                llama: 460,
                mistral: 460,
                deepseek: 460,
                qwen: 460,
                cohere: 465,
                grok: 460,
            },
            symbols: Vec::new(),
            importance: 0.5,
            content: Some(format!("# File {}\ndef func{}(): pass", i, i)),
        }));

        repo
    }

    #[test]
    fn test_fixed_chunking() {
        let repo = create_test_repo();
        let chunks = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000).chunk(&repo);

        assert!(!chunks.is_empty());
        // Each chunk either fits the budget or holds a single oversized file.
        assert!(chunks
            .iter()
            .all(|c| c.tokens <= 1000 || c.files.len() == 1));
    }

    #[test]
    fn test_file_chunking() {
        let repo = create_test_repo();
        let chunks = Chunker::new(ChunkStrategy::File, 8000).chunk(&repo);

        // File strategy yields exactly one chunk per file.
        assert_eq!(chunks.len(), repo.files.len());
    }

    #[test]
    fn test_semantic_chunking() {
        let repo = create_test_repo();
        let chunks = Chunker::new(ChunkStrategy::Semantic, 2000).chunk(&repo);

        assert!(!chunks.is_empty());
        // finalize_chunks must have stamped the real total on every chunk.
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_symbol_chunking() {
        let mut repo = create_test_repo();
        if let Some(file) = repo.files.get_mut(0) {
            let mut symbol = Symbol::new("func0", SymbolKind::Function);
            symbol.start_line = 1;
            symbol.end_line = 1;
            symbol.visibility = Visibility::Public;
            file.symbols.push(symbol);
        }

        let chunks = Chunker::new(ChunkStrategy::Symbol, 500).chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }
}