1mod strategies;
7mod types;
8
9use types::SymbolSnippet;
10pub use types::{Chunk, ChunkContext, ChunkFile, ChunkStrategy, Chunker, CrossReference};
11
12use crate::tokenizer::Tokenizer;
13use crate::types::{RepoFile, Repository, SymbolKind, TokenizerModel};
14use std::collections::{BTreeMap, HashMap, HashSet};
15
16impl Chunker {
17 pub fn new(strategy: ChunkStrategy, max_tokens: u32) -> Self {
19 Self { strategy, max_tokens, overlap_tokens: 200, model: TokenizerModel::Claude }
20 }
21
22 pub fn with_overlap(mut self, tokens: u32) -> Self {
24 self.overlap_tokens = tokens;
25 self
26 }
27
28 pub fn with_model(mut self, model: TokenizerModel) -> Self {
30 self.model = model;
31 self
32 }
33
34 pub fn chunk(&self, repo: &Repository) -> Vec<Chunk> {
36 match self.strategy {
37 ChunkStrategy::Fixed { size } => self.fixed_chunk(repo, size),
38 ChunkStrategy::File => self.file_chunk(repo),
39 ChunkStrategy::Module => self.module_chunk(repo),
40 ChunkStrategy::Symbol => self.symbol_chunk(repo),
41 ChunkStrategy::Semantic => self.semantic_chunk(repo),
42 ChunkStrategy::Dependency => self.dependency_chunk(repo),
43 }
44 }
45
46 pub(crate) fn create_chunk(&self, index: usize, files: &[RepoFile], tokens: u32) -> Chunk {
51 let focus = self.determine_focus(files);
52
53 Chunk {
54 index,
55 total: 0, focus: focus.clone(),
57 tokens,
58 files: files
59 .iter()
60 .map(|f| ChunkFile {
61 path: f.relative_path.clone(),
62 content: f.content.clone().unwrap_or_default(),
63 tokens: f.token_count.get(self.model),
64 truncated: false,
65 })
66 .collect(),
67 context: ChunkContext {
68 previous_summary: None,
69 current_focus: focus,
70 next_preview: None,
71 cross_references: Vec::new(),
72 overlap_content: None,
73 },
74 }
75 }
76
77 pub(crate) fn create_chunk_from_refs(
79 &self,
80 index: usize,
81 files: &[&RepoFile],
82 tokens: u32,
83 ) -> Chunk {
84 let focus = self.determine_focus_refs(files);
85
86 Chunk {
87 index,
88 total: 0, focus: focus.clone(),
90 tokens,
91 files: files
92 .iter()
93 .map(|f| ChunkFile {
94 path: f.relative_path.clone(),
95 content: f.content.clone().unwrap_or_default(),
96 tokens: f.token_count.get(self.model),
97 truncated: false,
98 })
99 .collect(),
100 context: ChunkContext {
101 previous_summary: None,
102 current_focus: focus,
103 next_preview: None,
104 cross_references: Vec::new(),
105 overlap_content: None,
106 },
107 }
108 }
109
110 pub(crate) fn build_symbol_chunk(
111 &self,
112 index: usize,
113 snippets: &[SymbolSnippet],
114 tokenizer: &Tokenizer,
115 ) -> Chunk {
116 let focus = self.determine_symbol_focus(snippets);
117 let mut by_file: BTreeMap<&str, Vec<&SymbolSnippet>> = BTreeMap::new();
118
119 for snippet in snippets {
120 by_file
121 .entry(snippet.file_path.as_str())
122 .or_default()
123 .push(snippet);
124 }
125
126 let mut files = Vec::new();
127 let mut total_tokens = 0u32;
128
129 for (path, mut entries) in by_file {
130 entries.sort_by(|a, b| {
131 a.start_line
132 .cmp(&b.start_line)
133 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
134 });
135
136 let mut content = String::new();
137 for entry in entries {
138 if !content.is_empty() {
139 content.push_str("\n\n");
140 }
141 content.push_str(&entry.content);
142 }
143
144 let tokens = tokenizer.count(&content, self.model);
145 total_tokens += tokens;
146
147 files.push(ChunkFile { path: path.to_owned(), content, tokens, truncated: false });
148 }
149
150 Chunk {
151 index,
152 total: 0,
153 focus: focus.clone(),
154 tokens: total_tokens,
155 files,
156 context: ChunkContext {
157 previous_summary: None,
158 current_focus: focus,
159 next_preview: None,
160 cross_references: Vec::new(),
161 overlap_content: None,
162 },
163 }
164 }
165
166 fn determine_focus(&self, files: &[RepoFile]) -> String {
171 if files.is_empty() {
172 return "Empty".to_owned();
173 }
174
175 let first_path = &files[0].relative_path;
177 if let Some(module) = first_path.split('/').next() {
178 if files.iter().all(|f| f.relative_path.starts_with(module)) {
179 return format!("{} module", module);
180 }
181 }
182
183 if let Some(lang) = &files[0].language {
185 if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
186 return format!("{} files", lang);
187 }
188 }
189
190 "Mixed content".to_owned()
191 }
192
193 fn determine_focus_refs(&self, files: &[&RepoFile]) -> String {
195 if files.is_empty() {
196 return "Empty".to_owned();
197 }
198
199 let first_path = &files[0].relative_path;
201 if let Some(module) = first_path.split('/').next() {
202 if files.iter().all(|f| f.relative_path.starts_with(module)) {
203 return format!("{} module", module);
204 }
205 }
206
207 if let Some(lang) = &files[0].language {
209 if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
210 return format!("{} files", lang);
211 }
212 }
213
214 "Mixed content".to_owned()
215 }
216
217 fn determine_symbol_focus(&self, snippets: &[SymbolSnippet]) -> String {
218 if snippets.is_empty() {
219 return "Symbols".to_owned();
220 }
221
222 let mut names: Vec<String> = snippets
223 .iter()
224 .take(3)
225 .map(|snippet| snippet.symbol_name.clone())
226 .collect();
227
228 let suffix = if snippets.len() > names.len() {
229 format!(" +{} more", snippets.len() - names.len())
230 } else {
231 String::new()
232 };
233
234 if names.len() == 1 {
235 format!("Symbol: {}{}", names.remove(0), suffix)
236 } else {
237 format!("Symbols: {}{}", names.join(", "), suffix)
238 }
239 }
240
241 pub(crate) fn get_overlap_files(&self, files: &[RepoFile]) -> Vec<RepoFile> {
246 files
249 .last()
250 .filter(|f| f.token_count.get(self.model) < self.overlap_tokens)
251 .cloned()
252 .into_iter()
253 .collect()
254 }
255
256 pub(crate) fn finalize_chunks(&self, mut chunks: Vec<Chunk>, repo: &Repository) -> Vec<Chunk> {
257 let total = chunks.len();
258
259 let focus_strs: Vec<String> = chunks.iter().map(|c| c.focus.clone()).collect();
261
262 let overlap_contents: Vec<Option<String>> = if self.overlap_tokens > 0 {
264 chunks
265 .iter()
266 .map(|chunk| self.extract_overlap_content(chunk))
267 .collect()
268 } else {
269 vec![None; chunks.len()]
270 };
271
272 for (i, chunk) in chunks.iter_mut().enumerate() {
273 chunk.total = total;
274
275 if i > 0 {
277 chunk.context.previous_summary = Some(format!("Previous: {}", focus_strs[i - 1]));
278
279 if let Some(ref overlap) = overlap_contents[i - 1] {
281 chunk.context.overlap_content = Some(format!(
282 "<!-- [OVERLAP FROM PREVIOUS CHUNK] -->\n{}\n<!-- [END OVERLAP] -->",
283 overlap
284 ));
285 }
286 }
287
288 if i + 1 < total {
290 chunk.context.next_preview = Some(format!("Next: Chunk {}", i + 2));
291 }
292 }
293
294 self.populate_cross_references(&mut chunks, repo);
295
296 chunks
297 }
298
299 fn populate_cross_references(&self, chunks: &mut [Chunk], repo: &Repository) {
300 const MAX_REFS: usize = 25;
301
302 #[derive(Clone)]
303 struct SymbolLocation {
304 chunk_index: usize,
305 file: String,
306 }
307
308 let file_lookup: HashMap<&str, &RepoFile> = repo
309 .files
310 .iter()
311 .map(|file| (file.relative_path.as_str(), file))
312 .collect();
313
314 let mut symbol_index: HashMap<String, Vec<SymbolLocation>> = HashMap::new();
315 let mut seen_symbols: HashSet<(String, usize, String)> = HashSet::new();
316
317 for (chunk_index, chunk) in chunks.iter().enumerate() {
318 for chunk_file in &chunk.files {
319 if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
320 for symbol in &repo_file.symbols {
321 if symbol.kind == SymbolKind::Import {
322 continue;
323 }
324 let key = (symbol.name.clone(), chunk_index, chunk_file.path.clone());
325 if seen_symbols.insert(key) {
326 symbol_index.entry(symbol.name.clone()).or_default().push(
327 SymbolLocation { chunk_index, file: chunk_file.path.clone() },
328 );
329 }
330 }
331 }
332 }
333 }
334
335 for (chunk_index, chunk) in chunks.iter_mut().enumerate() {
336 let mut refs: Vec<CrossReference> = Vec::new();
337 let mut seen_refs: HashSet<(String, usize, String)> = HashSet::new();
338
339 'files: for chunk_file in &chunk.files {
340 if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
341 for symbol in &repo_file.symbols {
342 for called in &symbol.calls {
343 if let Some(targets) = symbol_index.get(called) {
344 for target in targets {
345 if target.chunk_index == chunk_index {
346 continue;
347 }
348 let key = (
349 called.to_owned(),
350 target.chunk_index,
351 target.file.clone(),
352 );
353 if seen_refs.insert(key) {
354 refs.push(CrossReference {
355 symbol: called.to_owned(),
356 chunk_index: target.chunk_index,
357 file: target.file.clone(),
358 });
359 if refs.len() >= MAX_REFS {
360 break 'files;
361 }
362 }
363 }
364 }
365 }
366
367 if let Some(ref base) = symbol.extends {
368 if let Some(targets) = symbol_index.get(base) {
369 for target in targets {
370 if target.chunk_index == chunk_index {
371 continue;
372 }
373 let key =
374 (base.to_owned(), target.chunk_index, target.file.clone());
375 if seen_refs.insert(key) {
376 refs.push(CrossReference {
377 symbol: base.to_owned(),
378 chunk_index: target.chunk_index,
379 file: target.file.clone(),
380 });
381 if refs.len() >= MAX_REFS {
382 break 'files;
383 }
384 }
385 }
386 }
387 }
388
389 for iface in &symbol.implements {
390 if let Some(targets) = symbol_index.get(iface) {
391 for target in targets {
392 if target.chunk_index == chunk_index {
393 continue;
394 }
395 let key =
396 (iface.to_owned(), target.chunk_index, target.file.clone());
397 if seen_refs.insert(key) {
398 refs.push(CrossReference {
399 symbol: iface.to_owned(),
400 chunk_index: target.chunk_index,
401 file: target.file.clone(),
402 });
403 if refs.len() >= MAX_REFS {
404 break 'files;
405 }
406 }
407 }
408 }
409 }
410 }
411 }
412 }
413
414 refs.sort_by(|a, b| {
415 a.chunk_index
416 .cmp(&b.chunk_index)
417 .then_with(|| a.symbol.cmp(&b.symbol))
418 .then_with(|| a.file.cmp(&b.file))
419 });
420 if refs.len() > MAX_REFS {
421 refs.truncate(MAX_REFS);
422 }
423
424 chunk.context.cross_references = refs;
425 }
426 }
427
    /// Extract up to `overlap_tokens` worth of trailing content from `chunk`,
    /// to be prepended (by `finalize_chunks`) to the *next* chunk as context.
    ///
    /// Walks the chunk's files back-to-front, taking whole files while they
    /// fit the budget and a trailing slice of lines from the first file that
    /// does not. Returns `None` when overlap is disabled or nothing fits.
    fn extract_overlap_content(&self, chunk: &Chunk) -> Option<String> {
        if self.overlap_tokens == 0 || chunk.files.is_empty() {
            return None;
        }

        // NOTE(review): a fresh Tokenizer is constructed on every call —
        // fine if construction is cheap; worth hoisting otherwise. TODO confirm.
        let tokenizer = Tokenizer::new();
        let mut overlap_parts = Vec::new();
        let mut remaining_tokens = self.overlap_tokens;
        let token_model = self.model;

        // Reverse iteration so the overlap favors the chunk's tail.
        for file in chunk.files.iter().rev() {
            if remaining_tokens == 0 {
                break;
            }

            let file_tokens = tokenizer.count(&file.content, token_model);
            if file_tokens <= remaining_tokens {
                // The whole file fits inside the remaining budget.
                overlap_parts.push(format!("// From: {}\n{}", file.path, file.content));
                remaining_tokens = remaining_tokens.saturating_sub(file_tokens);
            } else {
                // Take as many trailing lines of this file as still fit.
                let lines: Vec<&str> = file.content.lines().collect();
                let mut partial_lines = Vec::new();
                let mut partial_tokens = 0u32;

                for line in lines.iter().rev() {
                    let line_tokens = tokenizer.count(line, token_model);
                    if partial_tokens + line_tokens > remaining_tokens {
                        break;
                    }
                    partial_lines.push(*line);
                    partial_tokens += line_tokens;
                }

                if !partial_lines.is_empty() {
                    // Lines were collected back-to-front; restore file order.
                    partial_lines.reverse();
                    let partial_content = partial_lines.join("\n");
                    overlap_parts
                        .push(format!("// From: {} (partial)\n{}", file.path, partial_content));
                }
                // A partial file exhausts the budget either way.
                remaining_tokens = 0;
            }
        }

        if overlap_parts.is_empty() {
            None
        } else {
            // Parts were gathered tail-first; reverse back into file order.
            overlap_parts.reverse();
            Some(overlap_parts.join("\n\n"))
        }
    }
482}
483
#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use crate::types::{Symbol, SymbolKind, TokenCounts, Visibility};

    // ---- fixtures -------------------------------------------------------

    /// Repo with five `src/file{i}.py` files (~500 Claude tokens each),
    /// no symbols attached.
    fn create_test_repo() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        for i in 0..5 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/src/file{}.py", i).into(),
                relative_path: format!("src/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 1000,
                token_count: TokenCounts {
                    o200k: 480,
                    cl100k: 490,
                    claude: 500,
                    gemini: 470,
                    llama: 460,
                    mistral: 460,
                    deepseek: 460,
                    qwen: 460,
                    cohere: 465,
                    grok: 460,
                },
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# File {}\ndef func{}(): pass", i, i)),
            });
        }

        repo
    }

    /// Repo with two top-level modules: moduleA (3 files), moduleB (2 files),
    /// 300 tokens per file.
    fn create_multi_module_repo() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        for i in 0..3 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/moduleA/file{}.py", i).into(),
                relative_path: format!("moduleA/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 500,
                token_count: TokenCounts::default_with_value(300),
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# Module A File {}\ndef funcA{}(): pass", i, i)),
            });
        }

        for i in 0..2 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/moduleB/file{}.py", i).into(),
                relative_path: format!("moduleB/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 500,
                token_count: TokenCounts::default_with_value(300),
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# Module B File {}\ndef funcB{}(): pass", i, i)),
            });
        }

        repo
    }

    /// Repo where `main.py` imports `utils.py` and `main()` calls `helper()`
    /// — exercises dependency ordering and cross-reference resolution.
    fn create_repo_with_imports() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        let mut file_a = RepoFile {
            path: "/tmp/test/src/utils.py".into(),
            relative_path: "src/utils.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 500,
            token_count: TokenCounts::default_with_value(200),
            symbols: vec![Symbol::new("helper", SymbolKind::Function)],
            importance: 0.5,
            content: Some("def helper(): pass".to_string()),
        };
        file_a.symbols[0].start_line = 1;
        file_a.symbols[0].end_line = 1;

        let mut file_b = RepoFile {
            path: "/tmp/test/src/main.py".into(),
            relative_path: "src/main.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 500,
            token_count: TokenCounts::default_with_value(200),
            symbols: vec![
                Symbol::new("src/utils", SymbolKind::Import),
                Symbol::new("main", SymbolKind::Function),
            ],
            importance: 0.8,
            content: Some("from utils import helper\ndef main(): helper()".to_string()),
        };
        file_b.symbols[1].start_line = 2;
        file_b.symbols[1].end_line = 2;
        file_b.symbols[1].calls = vec!["helper".to_string()];

        repo.files.push(file_a);
        repo.files.push(file_b);

        repo
    }

    // ---- strategy smoke tests ------------------------------------------

    #[test]
    fn test_fixed_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        // A chunk may exceed the budget only when it holds a single
        // oversized file.
        assert!(chunks
            .iter()
            .all(|c| c.tokens <= 1000 || c.files.len() == 1));
    }

    #[test]
    fn test_file_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::File, 8000);
        let chunks = chunker.chunk(&repo);

        // File strategy: exactly one chunk per file.
        assert_eq!(chunks.len(), repo.files.len());
    }

    #[test]
    fn test_semantic_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Semantic, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_symbol_chunking() {
        let mut repo = create_test_repo();
        if let Some(file) = repo.files.get_mut(0) {
            let mut symbol = Symbol::new("func0", SymbolKind::Function);
            symbol.start_line = 1;
            symbol.end_line = 1;
            symbol.visibility = Visibility::Public;
            file.symbols.push(symbol);
        }

        let chunker = Chunker::new(ChunkStrategy::Symbol, 500);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    // ---- module strategy ------------------------------------------------

    #[test]
    fn test_module_chunking() {
        let repo = create_multi_module_repo();
        let chunker = Chunker::new(ChunkStrategy::Module, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_module_chunking_respects_max_tokens() {
        let repo = create_multi_module_repo();
        // 400 tokens is less than any two files combined (300 each).
        let chunker = Chunker::new(ChunkStrategy::Module, 400);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(chunk.tokens <= 400 || chunk.files.len() == 1);
        }
    }

    #[test]
    fn test_module_chunking_large_limit() {
        let repo = create_multi_module_repo();
        let chunker = Chunker::new(ChunkStrategy::Module, 10000);
        let chunks = chunker.chunk(&repo);

        // With a generous budget, each module collapses to one chunk.
        assert_eq!(chunks.len(), 2);
    }

    // ---- dependency strategy --------------------------------------------

    #[test]
    fn test_dependency_chunking() {
        let repo = create_repo_with_imports();
        let chunker = Chunker::new(ChunkStrategy::Dependency, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_dependency_chunking_order() {
        let repo = create_repo_with_imports();
        let chunker = Chunker::new(ChunkStrategy::Dependency, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_dependency_chunking_with_cycles() {
        // a.py and b.py import and call each other; chunking must still
        // terminate and cover both files exactly once.
        let mut repo = Repository::new("test", "/tmp/test");

        let mut file_a = RepoFile {
            path: "/tmp/test/a.py".into(),
            relative_path: "a.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 500,
            token_count: TokenCounts::default_with_value(200),
            symbols: vec![
                Symbol::new("b", SymbolKind::Import),
                Symbol::new("funcA", SymbolKind::Function),
            ],
            importance: 0.5,
            content: Some("from b import funcB\ndef funcA(): funcB()".to_string()),
        };
        file_a.symbols[1].calls = vec!["funcB".to_string()];

        let mut file_b = RepoFile {
            path: "/tmp/test/b.py".into(),
            relative_path: "b.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 500,
            token_count: TokenCounts::default_with_value(200),
            symbols: vec![
                Symbol::new("a", SymbolKind::Import),
                Symbol::new("funcB", SymbolKind::Function),
            ],
            importance: 0.5,
            content: Some("from a import funcA\ndef funcB(): funcA()".to_string()),
        };
        file_b.symbols[1].calls = vec!["funcA".to_string()];

        repo.files.push(file_a);
        repo.files.push(file_b);

        let chunker = Chunker::new(ChunkStrategy::Dependency, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        let total_files: usize = chunks.iter().map(|c| c.files.len()).sum();
        assert_eq!(total_files, 2);
    }

    // ---- symbol strategy edge cases -------------------------------------

    #[test]
    fn test_symbol_chunking_no_symbols() {
        // No symbols at all — the strategy must still produce chunks.
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Symbol, 500);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_symbol_chunking_with_imports() {
        let mut repo = create_test_repo();
        if let Some(file) = repo.files.get_mut(0) {
            // Imports should be skipped by symbol chunking.
            file.symbols.push(Symbol::new("os", SymbolKind::Import));
            file.symbols.push(Symbol::new("sys", SymbolKind::Import));
            let mut func = Symbol::new("func0", SymbolKind::Function);
            func.start_line = 3;
            func.end_line = 5;
            file.symbols.push(func);
        }

        let chunker = Chunker::new(ChunkStrategy::Symbol, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_symbol_chunking_multiple_symbols_per_file() {
        let mut repo = Repository::new("test", "/tmp/test");
        let mut file = RepoFile {
            path: "/tmp/test/main.py".into(),
            relative_path: "main.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 1000,
            token_count: TokenCounts::default_with_value(500),
            symbols: Vec::new(),
            importance: 0.8,
            content: Some("def func1(): pass\ndef func2(): pass\ndef func3(): pass".to_string()),
        };

        for i in 1..=3 {
            let mut sym = Symbol::new(&format!("func{}", i), SymbolKind::Function);
            sym.start_line = i;
            sym.end_line = i;
            sym.importance = 0.9 - (i as f32 * 0.1);
            file.symbols.push(sym);
        }
        repo.files.push(file);

        let chunker = Chunker::new(ChunkStrategy::Symbol, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    // ---- builder methods -------------------------------------------------

    #[test]
    fn test_chunker_with_overlap() {
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000).with_overlap(500);
        assert_eq!(chunker.overlap_tokens, 500);
    }

    #[test]
    fn test_chunker_with_model() {
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000)
            .with_model(TokenizerModel::Gpt4o);
        assert_eq!(chunker.model, TokenizerModel::Gpt4o);
    }

    #[test]
    fn test_chunker_builder_chain() {
        let chunker = Chunker::new(ChunkStrategy::Semantic, 2000)
            .with_overlap(300)
            .with_model(TokenizerModel::Gemini);

        assert_eq!(chunker.overlap_tokens, 300);
        assert_eq!(chunker.model, TokenizerModel::Gemini);
        assert!(matches!(chunker.strategy, ChunkStrategy::Semantic));
    }

    // ---- focus labeling --------------------------------------------------

    #[test]
    fn test_determine_focus_empty() {
        let chunker = Chunker::new(ChunkStrategy::File, 1000);
        let files: Vec<RepoFile> = vec![];
        let focus = chunker.determine_focus(&files);
        assert_eq!(focus, "Empty");
    }

    #[test]
    fn test_determine_focus_common_module() {
        let repo = create_multi_module_repo();
        let chunker = Chunker::new(ChunkStrategy::File, 1000);
        let module_a_files: Vec<RepoFile> = repo
            .files
            .iter()
            .filter(|f| f.relative_path.starts_with("moduleA"))
            .cloned()
            .collect();

        let focus = chunker.determine_focus(&module_a_files);
        assert!(focus.contains("moduleA"));
    }

    #[test]
    fn test_determine_focus_common_language() {
        // Files share a language but not a top-level directory.
        let mut repo = Repository::new("test", "/tmp/test");
        for i in 0..3 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/dir{}/file.rs", i).into(),
                relative_path: format!("dir{}/file.rs", i),
                language: Some("rust".to_string()),
                size_bytes: 500,
                token_count: TokenCounts::default_with_value(200),
                symbols: Vec::new(),
                importance: 0.5,
                content: Some("fn main() {}".to_string()),
            });
        }

        let chunker = Chunker::new(ChunkStrategy::File, 1000);
        let focus = chunker.determine_focus(&repo.files);
        assert!(focus.contains("rust") || focus.contains("Mixed"));
    }

    // ---- chunk context ---------------------------------------------------

    #[test]
    fn test_chunk_context_previous_summary() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
        let chunks = chunker.chunk(&repo);

        if chunks.len() > 1 {
            // First chunk has no predecessor; later chunks link back.
            assert!(chunks[0].context.previous_summary.is_none());
            assert!(chunks[1].context.previous_summary.is_some());
        }
    }

    #[test]
    fn test_chunk_context_next_preview() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
        let chunks = chunker.chunk(&repo);

        if chunks.len() > 1 {
            // Last chunk has no successor; earlier chunks link forward.
            assert!(chunks[0].context.next_preview.is_some());
            assert!(chunks.last().unwrap().context.next_preview.is_none());
        }
    }

    // ---- overlap ---------------------------------------------------------

    #[test]
    fn test_extract_overlap_content() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600).with_overlap(100);
        let chunks = chunker.chunk(&repo);

        if chunks.len() > 1 {
            assert!(chunks.iter().all(|c| c.total == chunks.len()));
        }
    }

    #[test]
    fn test_no_overlap_when_zero() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600).with_overlap(0);
        let chunks = chunker.chunk(&repo);

        for chunk in &chunks {
            assert!(chunk.context.overlap_content.is_none());
        }
    }

    // ---- cross references ------------------------------------------------

    #[test]
    fn test_cross_references_populated() {
        let repo = create_repo_with_imports();
        let chunker = Chunker::new(ChunkStrategy::File, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    // ---- empty repositories ----------------------------------------------

    #[test]
    fn test_fixed_chunking_empty_repo() {
        let repo = Repository::new("empty", "/tmp/empty");
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000);
        let chunks = chunker.chunk(&repo);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_module_chunking_empty_repo() {
        let repo = Repository::new("empty", "/tmp/empty");
        let chunker = Chunker::new(ChunkStrategy::Module, 1000);
        let chunks = chunker.chunk(&repo);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_dependency_chunking_empty_repo() {
        let repo = Repository::new("empty", "/tmp/empty");
        let chunker = Chunker::new(ChunkStrategy::Dependency, 1000);
        let chunks = chunker.chunk(&repo);
        assert!(chunks.is_empty());
    }

    // ---- oversized input -------------------------------------------------

    #[test]
    fn test_fixed_chunking_single_large_file() {
        // A single file far larger than the chunk budget must still chunk.
        let mut repo = Repository::new("test", "/tmp/test");
        repo.files.push(RepoFile {
            path: "/tmp/test/large.py".into(),
            relative_path: "large.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 50000,
            token_count: TokenCounts::default_with_value(10000),
            symbols: Vec::new(),
            importance: 0.5,
            content: Some("x = 1\n".repeat(1000)),
        });

        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 500 }, 500);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    // ---- invariants ------------------------------------------------------

    #[test]
    fn test_chunk_total_is_correct() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
        let chunks = chunker.chunk(&repo);

        let expected_total = chunks.len();
        for chunk in &chunks {
            assert_eq!(chunk.total, expected_total);
        }
    }

    #[test]
    fn test_chunk_index_is_sequential() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
        let chunks = chunker.chunk(&repo);

        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i);
        }
    }
}