1mod strategies;
7mod types;
8
9use types::SymbolSnippet;
10pub use types::{Chunk, ChunkContext, ChunkFile, ChunkStrategy, Chunker, CrossReference};
11
12use crate::tokenizer::Tokenizer;
13use crate::types::{RepoFile, Repository, SymbolKind, TokenizerModel};
14use std::collections::{BTreeMap, HashMap, HashSet};
15
16fn determine_focus_impl<'a>(mut files: impl Iterator<Item = &'a RepoFile>) -> String {
18 let first = match files.next() {
19 Some(f) => f,
20 None => return "Empty".to_owned(),
21 };
22
23 let rest: Vec<&RepoFile> = files.collect();
25
26 if let Some(module) = first.relative_path.split('/').next() {
28 if rest.iter().all(|f| f.relative_path.starts_with(module)) {
29 return format!("{} module", module);
30 }
31 }
32
33 if let Some(lang) = &first.language {
35 if rest.iter().all(|f| f.language.as_ref() == Some(lang)) {
36 return format!("{} files", lang);
37 }
38 }
39
40 "Mixed content".to_owned()
41}
42
43impl Chunker {
44 pub fn new(strategy: ChunkStrategy, max_tokens: u32) -> Self {
46 Self { strategy, max_tokens, overlap_tokens: 200, model: TokenizerModel::Claude }
47 }
48
49 pub fn with_overlap(mut self, tokens: u32) -> Self {
51 self.overlap_tokens = tokens;
52 self
53 }
54
55 pub fn with_model(mut self, model: TokenizerModel) -> Self {
57 self.model = model;
58 self
59 }
60
61 pub fn chunk(&self, repo: &Repository) -> Vec<Chunk> {
63 match self.strategy {
64 ChunkStrategy::Fixed { size } => self.fixed_chunk(repo, size),
65 ChunkStrategy::File => self.file_chunk(repo),
66 ChunkStrategy::Module => self.module_chunk(repo),
67 ChunkStrategy::Symbol => self.symbol_chunk(repo),
68 ChunkStrategy::Semantic => self.semantic_chunk(repo),
69 ChunkStrategy::Dependency => self.dependency_chunk(repo),
70 }
71 }
72
73 pub(crate) fn create_chunk(&self, index: usize, files: &[RepoFile], tokens: u32) -> Chunk {
78 let focus = self.determine_focus(files);
79
80 Chunk {
81 index,
82 total: 0, focus: focus.clone(),
84 tokens,
85 files: files
86 .iter()
87 .map(|f| ChunkFile {
88 path: f.relative_path.clone(),
89 content: f.content.clone().unwrap_or_default(),
90 tokens: f.token_count.get(self.model),
91 truncated: false,
92 })
93 .collect(),
94 context: ChunkContext {
95 previous_summary: None,
96 current_focus: focus,
97 next_preview: None,
98 cross_references: Vec::new(),
99 overlap_content: None,
100 },
101 }
102 }
103
104 pub(crate) fn create_chunk_from_refs(
106 &self,
107 index: usize,
108 files: &[&RepoFile],
109 tokens: u32,
110 ) -> Chunk {
111 let focus = self.determine_focus_refs(files);
112
113 Chunk {
114 index,
115 total: 0, focus: focus.clone(),
117 tokens,
118 files: files
119 .iter()
120 .map(|f| ChunkFile {
121 path: f.relative_path.clone(),
122 content: f.content.clone().unwrap_or_default(),
123 tokens: f.token_count.get(self.model),
124 truncated: false,
125 })
126 .collect(),
127 context: ChunkContext {
128 previous_summary: None,
129 current_focus: focus,
130 next_preview: None,
131 cross_references: Vec::new(),
132 overlap_content: None,
133 },
134 }
135 }
136
137 pub(crate) fn build_symbol_chunk(
138 &self,
139 index: usize,
140 snippets: &[SymbolSnippet],
141 tokenizer: &Tokenizer,
142 ) -> Chunk {
143 let focus = self.determine_symbol_focus(snippets);
144 let mut by_file: BTreeMap<&str, Vec<&SymbolSnippet>> = BTreeMap::new();
145
146 for snippet in snippets {
147 by_file
148 .entry(snippet.file_path.as_str())
149 .or_default()
150 .push(snippet);
151 }
152
153 let mut files = Vec::new();
154 let mut total_tokens = 0u32;
155
156 for (path, mut entries) in by_file {
157 entries.sort_by(|a, b| {
158 a.start_line
159 .cmp(&b.start_line)
160 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
161 });
162
163 let mut content = String::new();
164 for entry in entries {
165 if !content.is_empty() {
166 content.push_str("\n\n");
167 }
168 content.push_str(&entry.content);
169 }
170
171 let tokens = tokenizer.count(&content, self.model);
172 total_tokens += tokens;
173
174 files.push(ChunkFile { path: path.to_owned(), content, tokens, truncated: false });
175 }
176
177 Chunk {
178 index,
179 total: 0,
180 focus: focus.clone(),
181 tokens: total_tokens,
182 files,
183 context: ChunkContext {
184 previous_summary: None,
185 current_focus: focus,
186 next_preview: None,
187 cross_references: Vec::new(),
188 overlap_content: None,
189 },
190 }
191 }
192
    /// Focus label for a slice of owned files; see `determine_focus_impl`.
    fn determine_focus(&self, files: &[RepoFile]) -> String {
        determine_focus_impl(files.iter())
    }
200
    /// Focus label for a slice of borrowed files; see `determine_focus_impl`.
    fn determine_focus_refs(&self, files: &[&RepoFile]) -> String {
        determine_focus_impl(files.iter().copied())
    }
205
206 fn determine_symbol_focus(&self, snippets: &[SymbolSnippet]) -> String {
207 if snippets.is_empty() {
208 return "Symbols".to_owned();
209 }
210
211 let mut names: Vec<String> = snippets
212 .iter()
213 .take(3)
214 .map(|snippet| snippet.symbol_name.clone())
215 .collect();
216
217 let suffix = if snippets.len() > names.len() {
218 format!(" +{} more", snippets.len() - names.len())
219 } else {
220 String::new()
221 };
222
223 if names.len() == 1 {
224 format!("Symbol: {}{}", names.remove(0), suffix)
225 } else {
226 format!("Symbols: {}{}", names.join(", "), suffix)
227 }
228 }
229
230 pub(crate) fn get_overlap_files(&self, files: &[RepoFile]) -> Vec<RepoFile> {
235 files
238 .last()
239 .filter(|f| f.token_count.get(self.model) < self.overlap_tokens)
240 .cloned()
241 .into_iter()
242 .collect()
243 }
244
245 pub(crate) fn finalize_chunks(&self, mut chunks: Vec<Chunk>, repo: &Repository) -> Vec<Chunk> {
246 let total = chunks.len();
247
248 let focus_strs: Vec<String> = chunks.iter().map(|c| c.focus.clone()).collect();
250
251 let overlap_contents: Vec<Option<String>> = if self.overlap_tokens > 0 {
253 chunks
254 .iter()
255 .map(|chunk| self.extract_overlap_content(chunk))
256 .collect()
257 } else {
258 vec![None; chunks.len()]
259 };
260
261 for (i, chunk) in chunks.iter_mut().enumerate() {
262 chunk.total = total;
263
264 if i > 0 {
266 chunk.context.previous_summary = Some(format!("Previous: {}", focus_strs[i - 1]));
267
268 if let Some(ref overlap) = overlap_contents[i - 1] {
270 chunk.context.overlap_content = Some(format!(
271 "<!-- [OVERLAP FROM PREVIOUS CHUNK] -->\n{}\n<!-- [END OVERLAP] -->",
272 overlap
273 ));
274 }
275 }
276
277 if i + 1 < total {
279 chunk.context.next_preview = Some(format!("Next: Chunk {}", i + 2));
280 }
281 }
282
283 self.populate_cross_references(&mut chunks, repo);
284
285 chunks
286 }
287
288 fn populate_cross_references(&self, chunks: &mut [Chunk], repo: &Repository) {
289 const MAX_REFS: usize = 25;
290
291 #[derive(Clone)]
292 struct SymbolLocation {
293 chunk_index: usize,
294 file: String,
295 }
296
297 let file_lookup: HashMap<&str, &RepoFile> = repo
298 .files
299 .iter()
300 .map(|file| (file.relative_path.as_str(), file))
301 .collect();
302
303 let mut symbol_index: HashMap<String, Vec<SymbolLocation>> = HashMap::new();
304 let mut seen_symbols: HashSet<(String, usize, String)> = HashSet::new();
305
306 for (chunk_index, chunk) in chunks.iter().enumerate() {
307 for chunk_file in &chunk.files {
308 if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
309 for symbol in &repo_file.symbols {
310 if symbol.kind == SymbolKind::Import {
311 continue;
312 }
313 let key = (symbol.name.clone(), chunk_index, chunk_file.path.clone());
314 if seen_symbols.insert(key) {
315 symbol_index.entry(symbol.name.clone()).or_default().push(
316 SymbolLocation { chunk_index, file: chunk_file.path.clone() },
317 );
318 }
319 }
320 }
321 }
322 }
323
324 for (chunk_index, chunk) in chunks.iter_mut().enumerate() {
325 let mut refs: Vec<CrossReference> = Vec::new();
326 let mut seen_refs: HashSet<(String, usize, String)> = HashSet::new();
327
328 'files: for chunk_file in &chunk.files {
329 if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
330 for symbol in &repo_file.symbols {
331 for called in &symbol.calls {
332 if let Some(targets) = symbol_index.get(called) {
333 for target in targets {
334 if target.chunk_index == chunk_index {
335 continue;
336 }
337 let key = (
338 called.to_owned(),
339 target.chunk_index,
340 target.file.clone(),
341 );
342 if seen_refs.insert(key) {
343 refs.push(CrossReference {
344 symbol: called.to_owned(),
345 chunk_index: target.chunk_index,
346 file: target.file.clone(),
347 });
348 if refs.len() >= MAX_REFS {
349 break 'files;
350 }
351 }
352 }
353 }
354 }
355
356 if let Some(ref base) = symbol.extends {
357 if let Some(targets) = symbol_index.get(base) {
358 for target in targets {
359 if target.chunk_index == chunk_index {
360 continue;
361 }
362 let key =
363 (base.to_owned(), target.chunk_index, target.file.clone());
364 if seen_refs.insert(key) {
365 refs.push(CrossReference {
366 symbol: base.to_owned(),
367 chunk_index: target.chunk_index,
368 file: target.file.clone(),
369 });
370 if refs.len() >= MAX_REFS {
371 break 'files;
372 }
373 }
374 }
375 }
376 }
377
378 for iface in &symbol.implements {
379 if let Some(targets) = symbol_index.get(iface) {
380 for target in targets {
381 if target.chunk_index == chunk_index {
382 continue;
383 }
384 let key =
385 (iface.to_owned(), target.chunk_index, target.file.clone());
386 if seen_refs.insert(key) {
387 refs.push(CrossReference {
388 symbol: iface.to_owned(),
389 chunk_index: target.chunk_index,
390 file: target.file.clone(),
391 });
392 if refs.len() >= MAX_REFS {
393 break 'files;
394 }
395 }
396 }
397 }
398 }
399 }
400 }
401 }
402
403 refs.sort_by(|a, b| {
404 a.chunk_index
405 .cmp(&b.chunk_index)
406 .then_with(|| a.symbol.cmp(&b.symbol))
407 .then_with(|| a.file.cmp(&b.file))
408 });
409 if refs.len() > MAX_REFS {
410 refs.truncate(MAX_REFS);
411 }
412
413 chunk.context.cross_references = refs;
414 }
415 }
416
417 fn extract_overlap_content(&self, chunk: &Chunk) -> Option<String> {
419 if self.overlap_tokens == 0 || chunk.files.is_empty() {
420 return None;
421 }
422
423 let tokenizer = Tokenizer::new();
424 let mut overlap_parts = Vec::new();
425 let mut remaining_tokens = self.overlap_tokens;
426 let token_model = self.model;
427
428 for file in chunk.files.iter().rev() {
430 if remaining_tokens == 0 {
431 break;
432 }
433
434 let file_tokens = tokenizer.count(&file.content, token_model);
435 if file_tokens <= remaining_tokens {
436 overlap_parts.push(format!("// From: {}\n{}", file.path, file.content));
438 remaining_tokens = remaining_tokens.saturating_sub(file_tokens);
439 } else {
440 let lines: Vec<&str> = file.content.lines().collect();
442 let mut partial_lines = Vec::new();
443 let mut partial_tokens = 0u32;
444
445 for line in lines.iter().rev() {
446 let line_tokens = tokenizer.count(line, token_model);
447 if partial_tokens + line_tokens > remaining_tokens {
448 break;
449 }
450 partial_lines.push(*line);
451 partial_tokens += line_tokens;
452 }
453
454 if !partial_lines.is_empty() {
455 partial_lines.reverse();
456 let partial_content = partial_lines.join("\n");
457 overlap_parts
458 .push(format!("// From: {} (partial)\n{}", file.path, partial_content));
459 }
460 remaining_tokens = 0;
461 }
462 }
463
464 if overlap_parts.is_empty() {
465 None
466 } else {
467 overlap_parts.reverse();
468 Some(overlap_parts.join("\n\n"))
469 }
470 }
471}
472
#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use crate::types::{Symbol, SymbolKind, TokenCounts, Visibility};

    // --- Fixtures -------------------------------------------------------

    /// Five small Python files under src/, each ~500 Claude tokens, with
    /// explicit per-model token counts and no symbols.
    fn create_test_repo() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        for i in 0..5 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/src/file{}.py", i).into(),
                relative_path: format!("src/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 1000,
                token_count: TokenCounts {
                    o200k: 480,
                    cl100k: 490,
                    claude: 500,
                    gemini: 470,
                    llama: 460,
                    mistral: 460,
                    deepseek: 460,
                    qwen: 460,
                    cohere: 465,
                    grok: 460,
                },
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# File {}\ndef func{}(): pass", i, i)),
            });
        }

        repo
    }

    /// Two top-level modules: moduleA (3 files) and moduleB (2 files),
    /// 300 tokens each — used by the Module-strategy tests.
    fn create_multi_module_repo() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        for i in 0..3 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/moduleA/file{}.py", i).into(),
                relative_path: format!("moduleA/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 500,
                token_count: TokenCounts::default_with_value(300),
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# Module A File {}\ndef funcA{}(): pass", i, i)),
            });
        }

        for i in 0..2 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/moduleB/file{}.py", i).into(),
                relative_path: format!("moduleB/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 500,
                token_count: TokenCounts::default_with_value(300),
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# Module B File {}\ndef funcB{}(): pass", i, i)),
            });
        }

        repo
    }

    /// Two files where main.py imports utils.py and `main` calls `helper` —
    /// used by the dependency and cross-reference tests.
    fn create_repo_with_imports() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        let mut file_a = RepoFile {
            path: "/tmp/test/src/utils.py".into(),
            relative_path: "src/utils.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 500,
            token_count: TokenCounts::default_with_value(200),
            symbols: vec![Symbol::new("helper", SymbolKind::Function)],
            importance: 0.5,
            content: Some("def helper(): pass".to_string()),
        };
        file_a.symbols[0].start_line = 1;
        file_a.symbols[0].end_line = 1;

        let mut file_b = RepoFile {
            path: "/tmp/test/src/main.py".into(),
            relative_path: "src/main.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 500,
            token_count: TokenCounts::default_with_value(200),
            symbols: vec![
                Symbol::new("src/utils", SymbolKind::Import),
                Symbol::new("main", SymbolKind::Function),
            ],
            importance: 0.8,
            content: Some("from utils import helper\ndef main(): helper()".to_string()),
        };
        file_b.symbols[1].start_line = 2;
        file_b.symbols[1].end_line = 2;
        file_b.symbols[1].calls = vec!["helper".to_string()];

        repo.files.push(file_a);
        repo.files.push(file_b);

        repo
    }

    // --- Strategy smoke tests -------------------------------------------

    #[test]
    fn test_fixed_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        // A chunk may exceed the size only when it holds a single file.
        assert!(chunks
            .iter()
            .all(|c| c.tokens <= 1000 || c.files.len() == 1));
    }

    #[test]
    fn test_file_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::File, 8000);
        let chunks = chunker.chunk(&repo);

        assert_eq!(chunks.len(), repo.files.len());
    }

    #[test]
    fn test_semantic_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Semantic, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_symbol_chunking() {
        let mut repo = create_test_repo();
        if let Some(file) = repo.files.get_mut(0) {
            let mut symbol = Symbol::new("func0", SymbolKind::Function);
            symbol.start_line = 1;
            symbol.end_line = 1;
            symbol.visibility = Visibility::Public;
            file.symbols.push(symbol);
        }

        let chunker = Chunker::new(ChunkStrategy::Symbol, 500);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    // --- Module strategy ------------------------------------------------

    #[test]
    fn test_module_chunking() {
        let repo = create_multi_module_repo();
        let chunker = Chunker::new(ChunkStrategy::Module, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_module_chunking_respects_max_tokens() {
        let repo = create_multi_module_repo();
        let chunker = Chunker::new(ChunkStrategy::Module, 400);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(chunk.tokens <= 400 || chunk.files.len() == 1);
        }
    }

    #[test]
    fn test_module_chunking_large_limit() {
        let repo = create_multi_module_repo();
        let chunker = Chunker::new(ChunkStrategy::Module, 10000);
        let chunks = chunker.chunk(&repo);

        // Generous budget: one chunk per module (moduleA, moduleB).
        assert_eq!(chunks.len(), 2);
    }

    // --- Dependency strategy --------------------------------------------

    #[test]
    fn test_dependency_chunking() {
        let repo = create_repo_with_imports();
        let chunker = Chunker::new(ChunkStrategy::Dependency, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_dependency_chunking_order() {
        let repo = create_repo_with_imports();
        let chunker = Chunker::new(ChunkStrategy::Dependency, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_dependency_chunking_with_cycles() {
        // a.py and b.py import and call each other; the strategy must not
        // loop forever and must still place both files somewhere.
        let mut repo = Repository::new("test", "/tmp/test");

        let mut file_a = RepoFile {
            path: "/tmp/test/a.py".into(),
            relative_path: "a.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 500,
            token_count: TokenCounts::default_with_value(200),
            symbols: vec![
                Symbol::new("b", SymbolKind::Import),
                Symbol::new("funcA", SymbolKind::Function),
            ],
            importance: 0.5,
            content: Some("from b import funcB\ndef funcA(): funcB()".to_string()),
        };
        file_a.symbols[1].calls = vec!["funcB".to_string()];

        let mut file_b = RepoFile {
            path: "/tmp/test/b.py".into(),
            relative_path: "b.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 500,
            token_count: TokenCounts::default_with_value(200),
            symbols: vec![
                Symbol::new("a", SymbolKind::Import),
                Symbol::new("funcB", SymbolKind::Function),
            ],
            importance: 0.5,
            content: Some("from a import funcA\ndef funcB(): funcA()".to_string()),
        };
        file_b.symbols[1].calls = vec!["funcA".to_string()];

        repo.files.push(file_a);
        repo.files.push(file_b);

        let chunker = Chunker::new(ChunkStrategy::Dependency, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        let total_files: usize = chunks.iter().map(|c| c.files.len()).sum();
        assert_eq!(total_files, 2);
    }

    // --- Symbol strategy edge cases -------------------------------------

    #[test]
    fn test_symbol_chunking_no_symbols() {
        let repo = create_test_repo(); let chunker = Chunker::new(ChunkStrategy::Symbol, 500);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_symbol_chunking_with_imports() {
        let mut repo = create_test_repo();
        if let Some(file) = repo.files.get_mut(0) {
            file.symbols.push(Symbol::new("os", SymbolKind::Import));
            file.symbols.push(Symbol::new("sys", SymbolKind::Import));
            let mut func = Symbol::new("func0", SymbolKind::Function);
            func.start_line = 3;
            func.end_line = 5;
            file.symbols.push(func);
        }

        let chunker = Chunker::new(ChunkStrategy::Symbol, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_symbol_chunking_multiple_symbols_per_file() {
        let mut repo = Repository::new("test", "/tmp/test");
        let mut file = RepoFile {
            path: "/tmp/test/main.py".into(),
            relative_path: "main.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 1000,
            token_count: TokenCounts::default_with_value(500),
            symbols: Vec::new(),
            importance: 0.8,
            content: Some("def func1(): pass\ndef func2(): pass\ndef func3(): pass".to_string()),
        };

        for i in 1..=3 {
            let mut sym = Symbol::new(&format!("func{}", i), SymbolKind::Function);
            sym.start_line = i;
            sym.end_line = i;
            sym.importance = 0.9 - (i as f32 * 0.1);
            file.symbols.push(sym);
        }
        repo.files.push(file);

        let chunker = Chunker::new(ChunkStrategy::Symbol, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    // --- Builder API ----------------------------------------------------

    #[test]
    fn test_chunker_with_overlap() {
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000).with_overlap(500);
        assert_eq!(chunker.overlap_tokens, 500);
    }

    #[test]
    fn test_chunker_with_model() {
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000)
            .with_model(TokenizerModel::Gpt4o);
        assert_eq!(chunker.model, TokenizerModel::Gpt4o);
    }

    #[test]
    fn test_chunker_builder_chain() {
        let chunker = Chunker::new(ChunkStrategy::Semantic, 2000)
            .with_overlap(300)
            .with_model(TokenizerModel::Gemini);

        assert_eq!(chunker.overlap_tokens, 300);
        assert_eq!(chunker.model, TokenizerModel::Gemini);
        assert!(matches!(chunker.strategy, ChunkStrategy::Semantic));
    }

    // --- Focus labelling ------------------------------------------------

    #[test]
    fn test_determine_focus_empty() {
        let chunker = Chunker::new(ChunkStrategy::File, 1000);
        let files: Vec<RepoFile> = vec![];
        let focus = chunker.determine_focus(&files);
        assert_eq!(focus, "Empty");
    }

    #[test]
    fn test_determine_focus_common_module() {
        let repo = create_multi_module_repo();
        let chunker = Chunker::new(ChunkStrategy::File, 1000);
        let module_a_files: Vec<RepoFile> = repo
            .files
            .iter()
            .filter(|f| f.relative_path.starts_with("moduleA"))
            .cloned()
            .collect();

        let focus = chunker.determine_focus(&module_a_files);
        assert!(focus.contains("moduleA"));
    }

    #[test]
    fn test_determine_focus_common_language() {
        let mut repo = Repository::new("test", "/tmp/test");
        for i in 0..3 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/dir{}/file.rs", i).into(),
                relative_path: format!("dir{}/file.rs", i),
                language: Some("rust".to_string()),
                size_bytes: 500,
                token_count: TokenCounts::default_with_value(200),
                symbols: Vec::new(),
                importance: 0.5,
                content: Some("fn main() {}".to_string()),
            });
        }

        let chunker = Chunker::new(ChunkStrategy::File, 1000);
        let focus = chunker.determine_focus(&repo.files);
        assert!(focus.contains("rust") || focus.contains("Mixed"));
    }

    // --- Chunk context wiring -------------------------------------------

    #[test]
    fn test_chunk_context_previous_summary() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
        let chunks = chunker.chunk(&repo);

        if chunks.len() > 1 {
            assert!(chunks[0].context.previous_summary.is_none());
            assert!(chunks[1].context.previous_summary.is_some());
        }
    }

    #[test]
    fn test_chunk_context_next_preview() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
        let chunks = chunker.chunk(&repo);

        if chunks.len() > 1 {
            assert!(chunks[0].context.next_preview.is_some());
            assert!(chunks.last().unwrap().context.next_preview.is_none());
        }
    }

    // --- Overlap --------------------------------------------------------

    #[test]
    fn test_extract_overlap_content() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600).with_overlap(100);
        let chunks = chunker.chunk(&repo);

        if chunks.len() > 1 {
            assert!(chunks.iter().all(|c| c.total == chunks.len()));
        }
    }

    #[test]
    fn test_no_overlap_when_zero() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600).with_overlap(0);
        let chunks = chunker.chunk(&repo);

        for chunk in &chunks {
            assert!(chunk.context.overlap_content.is_none());
        }
    }

    // --- Cross references -----------------------------------------------

    #[test]
    fn test_cross_references_populated() {
        let repo = create_repo_with_imports();
        let chunker = Chunker::new(ChunkStrategy::File, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    // --- Empty repositories ---------------------------------------------

    #[test]
    fn test_fixed_chunking_empty_repo() {
        let repo = Repository::new("empty", "/tmp/empty");
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000);
        let chunks = chunker.chunk(&repo);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_module_chunking_empty_repo() {
        let repo = Repository::new("empty", "/tmp/empty");
        let chunker = Chunker::new(ChunkStrategy::Module, 1000);
        let chunks = chunker.chunk(&repo);
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_dependency_chunking_empty_repo() {
        let repo = Repository::new("empty", "/tmp/empty");
        let chunker = Chunker::new(ChunkStrategy::Dependency, 1000);
        let chunks = chunker.chunk(&repo);
        assert!(chunks.is_empty());
    }

    // --- Oversized input ------------------------------------------------

    #[test]
    fn test_fixed_chunking_single_large_file() {
        let mut repo = Repository::new("test", "/tmp/test");
        repo.files.push(RepoFile {
            path: "/tmp/test/large.py".into(),
            relative_path: "large.py".to_string(),
            language: Some("python".to_string()),
            size_bytes: 50000,
            token_count: TokenCounts::default_with_value(10000),
            symbols: Vec::new(),
            importance: 0.5,
            content: Some("x = 1\n".repeat(1000)),
        });

        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 500 }, 500);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
    }

    // --- Finalization invariants ----------------------------------------

    #[test]
    fn test_chunk_total_is_correct() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
        let chunks = chunker.chunk(&repo);

        let expected_total = chunks.len();
        for chunk in &chunks {
            assert_eq!(chunk.total, expected_total);
        }
    }

    #[test]
    fn test_chunk_index_is_sequential() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 600 }, 600);
        let chunks = chunker.chunk(&repo);

        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i);
        }
    }
}