use crate::types::{RepoFile, Repository, SymbolKind, TokenizerModel};
use serde::Serialize;

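/// One chunk of repository content, packed against the chunker's token budget.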
#[derive(Debug, Clone, Serialize)]
pub struct Chunk {
    pub index: usize,
    pub total: usize,
    pub focus: String,
    pub tokens: u32,
    pub files: Vec<ChunkFile>,
    pub context: ChunkContext,
}

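/// A single file's contribution to a [`Chunk`].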
#[derive(Debug, Clone, Serialize)]
pub struct ChunkFile {
    pub path: String,
    pub content: String,
    pub tokens: u32,
    pub truncated: bool,
}

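/// Navigation and continuity metadata attached to each chunk.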
#[derive(Debug, Clone, Serialize)]
pub struct ChunkContext {
    pub previous_summary: Option<String>,
    pub current_focus: String,
    pub next_preview: Option<String>,
    pub cross_references: Vec<CrossReference>,
    pub overlap_content: Option<String>,
}

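/// Points from a symbol used in one chunk to the chunk and file that define it.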
#[derive(Debug, Clone, Serialize)]
pub struct CrossReference {
    pub symbol: String,
    pub chunk_index: usize,
    pub file: String,
}

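/// Internal unit of work for symbol-level chunking: one extracted snippet plus
/// the metadata needed to rank and group it.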
#[derive(Debug, Clone)]
struct SymbolSnippet {
    file_path: String,
    symbol_name: String,
    start_line: u32,
    content: String,
    tokens: u32,
    importance: f32,
}

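/// How the repository is split into chunks. `Semantic` is the default: it
/// packs path-sorted files and prefers to break at top-level module boundaries.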
#[derive(Debug, Clone, Copy, Default)]
pub enum ChunkStrategy {
    Fixed { size: u32 },
    File,
    Module,
    Symbol,
    #[default]
    Semantic,
    Dependency,
}

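/// Splits a [`Repository`] into [`Chunk`]s according to a [`ChunkStrategy`].
///
/// Illustrative usage (a sketch; assumes a `Repository` built elsewhere, and
/// chains only the constructors defined below):
///
/// ```ignore
/// let chunker = Chunker::new(ChunkStrategy::Semantic, 8_000)
///     .with_overlap(200)
///     .with_model(TokenizerModel::Claude);
/// let chunks = chunker.chunk(&repo);
/// assert!(chunks.iter().all(|c| c.total == chunks.len()));
/// ```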
pub struct Chunker {
    strategy: ChunkStrategy,
    max_tokens: u32,
    overlap_tokens: u32,
    model: TokenizerModel,
}

impl Chunker {
    pub fn new(strategy: ChunkStrategy, max_tokens: u32) -> Self {
        Self { strategy, max_tokens, overlap_tokens: 200, model: TokenizerModel::Claude }
    }

    pub fn with_overlap(mut self, tokens: u32) -> Self {
        self.overlap_tokens = tokens;
        self
    }

    pub fn with_model(mut self, model: TokenizerModel) -> Self {
        self.model = model;
        self
    }

    pub fn chunk(&self, repo: &Repository) -> Vec<Chunk> {
        match self.strategy {
            ChunkStrategy::Fixed { size } => self.fixed_chunk(repo, size),
            ChunkStrategy::File => self.file_chunk(repo),
            ChunkStrategy::Module => self.module_chunk(repo),
            ChunkStrategy::Symbol => self.symbol_chunk(repo),
            ChunkStrategy::Semantic => self.semantic_chunk(repo),
            ChunkStrategy::Dependency => self.dependency_chunk(repo),
        }
    }

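    /// Greedy fixed-size packing: files are taken in repository order and a
    /// new chunk starts whenever the next file would overflow `size`. A single
    /// file larger than `size` still becomes its own chunk.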
    fn fixed_chunk(&self, repo: &Repository, size: u32) -> Vec<Chunk> {
        let mut chunks = Vec::new();
        let mut current_files = Vec::new();
        let mut current_tokens = 0u32;

        for file in &repo.files {
            let file_tokens = file.token_count.get(self.model);

            // Flush the current chunk before it would exceed the budget.
            if current_tokens + file_tokens > size && !current_files.is_empty() {
                chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));
                current_files.clear();
                current_tokens = 0;
            }

            current_files.push(file.clone());
            current_tokens += file_tokens;
        }

        if !current_files.is_empty() {
            chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));
        }

        self.finalize_chunks(chunks, repo)
    }

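    /// One chunk per file, in repository order.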
    fn file_chunk(&self, repo: &Repository) -> Vec<Chunk> {
        let chunks: Vec<_> = repo
            .files
            .iter()
            .enumerate()
            .map(|(i, file)| {
                self.create_chunk(i, std::slice::from_ref(file), file.token_count.get(self.model))
            })
            .collect();

        self.finalize_chunks(chunks, repo)
    }

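    /// Groups files by their top-level directory, then splits any module that
    /// exceeds the token budget with the same greedy packing as `fixed_chunk`.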
    fn module_chunk(&self, repo: &Repository) -> Vec<Chunk> {
        use std::collections::HashMap;

        let mut modules: HashMap<String, Vec<RepoFile>> = HashMap::new();

        for file in &repo.files {
            let module = file
                .relative_path
                .split('/')
                .next()
                .unwrap_or("root")
                .to_owned();

            modules.entry(module).or_default().push(file.clone());
        }

        let mut sorted_modules: Vec<_> = modules.into_iter().collect();
        sorted_modules.sort_by(|a, b| a.0.cmp(&b.0));

        let mut chunks = Vec::new();

        for (_module_name, mut files) in sorted_modules {
            files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));

            let module_tokens: u32 = files.iter().map(|f| f.token_count.get(self.model)).sum();

            if module_tokens <= self.max_tokens {
                chunks.push(self.create_chunk(chunks.len(), &files, module_tokens));
            } else {
                let mut current_files = Vec::new();
                let mut current_tokens = 0u32;

                for file in files {
                    let file_tokens = file.token_count.get(self.model);

                    if current_tokens + file_tokens > self.max_tokens && !current_files.is_empty() {
                        chunks.push(self.create_chunk(
                            chunks.len(),
                            &current_files,
                            current_tokens,
                        ));
                        current_files = Vec::new();
                        current_tokens = 0;
                    }

                    current_files.push(file);
                    current_tokens += file_tokens;
                }

                if !current_files.is_empty() {
                    chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));
                }
            }
        }

        self.finalize_chunks(chunks, repo)
    }

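    /// Extracts a snippet per symbol (with a couple of surrounding context
    /// lines), ranks snippets by importance blended 70/30 from symbol and file
    /// scores, and packs them into chunks. Falls back to `semantic_chunk` when
    /// no usable symbols exist.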
    fn symbol_chunk(&self, repo: &Repository) -> Vec<Chunk> {
        use crate::tokenizer::Tokenizer;

        const CONTEXT_LINES: u32 = 2;
        let tokenizer = Tokenizer::new();
        let mut snippets: Vec<SymbolSnippet> = Vec::new();

        for file in &repo.files {
            let content = match &file.content {
                Some(content) => content,
                None => continue,
            };

            let lines: Vec<&str> = content.lines().collect();
            let total_lines = lines.len() as u32;
            if total_lines == 0 {
                continue;
            }

            for symbol in &file.symbols {
                if symbol.kind == SymbolKind::Import {
                    continue;
                }

                let snippet_content = if symbol.start_line > 0
                    && symbol.end_line >= symbol.start_line
                    && symbol.start_line <= total_lines
                {
                    // Pad the symbol's line range with a little context,
                    // clamped to the file bounds (line numbers are 1-based).
                    let start = symbol.start_line.saturating_sub(CONTEXT_LINES).max(1);
                    let end = symbol
                        .end_line
                        .max(symbol.start_line)
                        .saturating_add(CONTEXT_LINES)
                        .min(total_lines);
                    let start_idx = start.saturating_sub(1) as usize;
                    let end_idx = end.saturating_sub(1) as usize;
                    if start_idx > end_idx || end_idx >= lines.len() {
                        continue;
                    }

                    let mut snippet = String::new();
                    snippet.push_str(&format!(
                        "// {}: {} (lines {}-{})\n",
                        symbol.kind.name(),
                        symbol.name,
                        start,
                        end
                    ));
                    snippet.push_str(&lines[start_idx..=end_idx].join("\n"));
                    snippet
                } else if let Some(ref sig) = symbol.signature {
                    format!("// {}: {}\n{}", symbol.kind.name(), symbol.name, sig.trim())
                } else {
                    continue;
                };

                let tokens = tokenizer.count(&snippet_content, self.model);
                let importance = (symbol.importance * 0.7) + (file.importance * 0.3);

                snippets.push(SymbolSnippet {
                    file_path: file.relative_path.clone(),
                    symbol_name: symbol.name.clone(),
                    start_line: symbol.start_line,
                    content: snippet_content,
                    tokens,
                    importance,
                });
            }
        }

        if snippets.is_empty() {
            return self.semantic_chunk(repo);
        }

        // Most important first; ties broken by size, then path, for stable output.
        snippets.sort_by(|a, b| {
            b.importance
                .partial_cmp(&a.importance)
                .unwrap_or(std::cmp::Ordering::Equal)
                .then_with(|| a.tokens.cmp(&b.tokens))
                .then_with(|| a.file_path.cmp(&b.file_path))
        });

        let mut chunks: Vec<Chunk> = Vec::new();
        let mut current: Vec<SymbolSnippet> = Vec::new();
        let mut current_tokens = 0u32;

        for snippet in snippets {
            if current_tokens + snippet.tokens > self.max_tokens && !current.is_empty() {
                chunks.push(self.build_symbol_chunk(chunks.len(), &current, &tokenizer));
                current.clear();
                current_tokens = 0;
            }

            current_tokens += snippet.tokens;
            current.push(snippet);
        }

        if !current.is_empty() {
            chunks.push(self.build_symbol_chunk(chunks.len(), &current, &tokenizer));
        }

        self.finalize_chunks(chunks, repo)
    }

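    /// Packs path-sorted files greedily, but also starts a new chunk at a
    /// top-level module boundary once the current chunk is more than half
    /// full, carrying a small overlap forward for continuity.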
    fn semantic_chunk(&self, repo: &Repository) -> Vec<Chunk> {
        let mut chunks = Vec::new();
        let mut current_files = Vec::new();
        let mut current_tokens = 0u32;
        let mut current_module: Option<String> = None;

        let mut sorted_files: Vec<_> = repo.files.iter().collect();
        sorted_files.sort_by(|a, b| a.relative_path.cmp(&b.relative_path));

        for file in sorted_files {
            let file_tokens = file.token_count.get(self.model);
            let file_module = file.relative_path.split('/').next().map(String::from);

            let should_split = current_tokens + file_tokens > self.max_tokens
                || (current_module.is_some()
                    && file_module.is_some()
                    && current_module != file_module
                    && current_tokens > self.max_tokens / 2);

            if should_split && !current_files.is_empty() {
                chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));

                // Seed the next chunk with a small overlap from this one.
                current_files = self.get_overlap_files(&current_files);
                current_tokens = current_files
                    .iter()
                    .map(|f| f.token_count.get(self.model))
                    .sum();
            }

            current_files.push(file.clone());
            current_tokens += file_tokens;
            current_module = file_module;
        }

        if !current_files.is_empty() {
            chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));
        }

        self.finalize_chunks(chunks, repo)
    }

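    /// Orders files by a Kahn-style topological sort of their import graph so
    /// that dependencies appear before dependents, then packs chunks greedily.
    /// Files caught in import cycles are appended in their original order.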
    fn dependency_chunk(&self, repo: &Repository) -> Vec<Chunk> {
        use std::collections::{HashMap, HashSet, VecDeque};

        let file_indices: HashMap<&str, usize> = repo
            .files
            .iter()
            .enumerate()
            .map(|(i, f)| (f.relative_path.as_str(), i))
            .collect();

        let mut imports_from: Vec<HashSet<usize>> = vec![HashSet::new(); repo.files.len()];
        let mut imported_by: Vec<HashSet<usize>> = vec![HashSet::new(); repo.files.len()];

        for (idx, file) in repo.files.iter().enumerate() {
            for symbol in &file.symbols {
                if symbol.kind == SymbolKind::Import {
                    let import_name = &symbol.name;

                    let potential_paths = Self::resolve_import_paths(import_name, file);

                    for potential in potential_paths {
                        if let Some(&target_idx) = file_indices.get(potential.as_str()) {
                            if target_idx != idx {
                                imports_from[idx].insert(target_idx);
                                imported_by[target_idx].insert(idx);
                            }
                        }
                    }
                }
            }
        }

        // Kahn's algorithm: repeatedly emit files whose dependencies have all
        // been emitted.
        let mut in_degree: Vec<usize> = imports_from.iter().map(|deps| deps.len()).collect();
        let mut queue: VecDeque<usize> = in_degree
            .iter()
            .enumerate()
            .filter_map(|(i, &d)| if d == 0 { Some(i) } else { None })
            .collect();

        let mut sorted_indices: Vec<usize> = Vec::with_capacity(repo.files.len());
        let mut sorted_set: HashSet<usize> = HashSet::with_capacity(repo.files.len());

        while let Some(idx) = queue.pop_front() {
            sorted_indices.push(idx);
            sorted_set.insert(idx);
            for &dependent in &imported_by[idx] {
                in_degree[dependent] -= 1;
                if in_degree[dependent] == 0 {
                    queue.push_back(dependent);
                }
            }
        }

        // Leftover files are part of an import cycle; keep their original order.
        if sorted_indices.len() < repo.files.len() {
            for idx in 0..repo.files.len() {
                if !sorted_set.contains(&idx) {
                    sorted_indices.push(idx);
                }
            }
        }

        let mut chunks = Vec::new();
        let mut current_files = Vec::new();
        let mut current_tokens = 0u32;
        let mut current_deps: HashSet<usize> = HashSet::new();

        for &idx in &sorted_indices {
            let file = &repo.files[idx];
            let file_tokens = file.token_count.get(self.model);

            let depends_on_current = imports_from[idx].iter().any(|d| current_deps.contains(d));

            // Prefer to keep a file in the same chunk as the files it imports,
            // even if that overshoots the budget slightly.
            let should_split = current_tokens + file_tokens > self.max_tokens
                && !current_files.is_empty()
                && !depends_on_current;

            if should_split {
                chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));
                current_files.clear();
                current_tokens = 0;
                current_deps.clear();
            }

            current_files.push(file.clone());
            current_tokens += file_tokens;
            current_deps.insert(idx);
        }

        if !current_files.is_empty() {
            chunks.push(self.create_chunk(chunks.len(), &current_files, current_tokens));
        }

        self.finalize_chunks(chunks, repo)
    }

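    /// Expands an import name into candidate relative paths across common
    /// language layouts (`foo.py`, `foo/index.ts`, `foo/mod.rs`, ...), both
    /// from the repository root and relative to the importing file.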
    fn resolve_import_paths(import_name: &str, source_file: &RepoFile) -> Vec<String> {
        let mut paths = Vec::new();
        let source_dir = source_file
            .relative_path
            .rsplit_once('/')
            .map(|(d, _)| d)
            .unwrap_or("");

        let normalized = import_name.replace("::", "/").replace(['.', '\\'], "/");

        let extensions = ["py", "js", "ts", "tsx", "jsx", "rs", "go", "java", "rb"];
        for ext in extensions {
            paths.push(format!("{}.{}", normalized, ext));
            paths.push(format!("{}/index.{}", normalized, ext));
            paths.push(format!("{}/mod.{}", normalized, ext));

            if !source_dir.is_empty() {
                paths.push(format!("{}/{}.{}", source_dir, normalized, ext));
            }
        }

        if import_name.contains('/') || import_name.contains('.') {
            paths.push(import_name.to_owned());
        }

        paths
    }

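    /// Builds a [`Chunk`] from whole files. `total` is left at zero here and
    /// patched in `finalize_chunks`.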
    fn create_chunk(&self, index: usize, files: &[RepoFile], tokens: u32) -> Chunk {
        let focus = self.determine_focus(files);

        Chunk {
            index,
            total: 0,
            focus: focus.clone(),
            tokens,
            files: files
                .iter()
                .map(|f| ChunkFile {
                    path: f.relative_path.clone(),
                    content: f.content.clone().unwrap_or_default(),
                    tokens: f.token_count.get(self.model),
                    truncated: false,
                })
                .collect(),
            context: ChunkContext {
                previous_summary: None,
                current_focus: focus,
                next_preview: None,
                cross_references: Vec::new(),
                overlap_content: None,
            },
        }
    }

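    /// Assembles one chunk from ranked snippets, regrouped by file and
    /// re-sorted into source order within each file.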
    fn build_symbol_chunk(
        &self,
        index: usize,
        snippets: &[SymbolSnippet],
        tokenizer: &crate::tokenizer::Tokenizer,
    ) -> Chunk {
        use std::collections::BTreeMap;

        let focus = self.determine_symbol_focus(snippets);
        let mut by_file: BTreeMap<&str, Vec<&SymbolSnippet>> = BTreeMap::new();

        for snippet in snippets {
            by_file
                .entry(snippet.file_path.as_str())
                .or_default()
                .push(snippet);
        }

        let mut files = Vec::new();
        let mut total_tokens = 0u32;

        for (path, mut entries) in by_file {
            entries.sort_by(|a, b| {
                a.start_line
                    .cmp(&b.start_line)
                    .then_with(|| a.symbol_name.cmp(&b.symbol_name))
            });

            let mut content = String::new();
            for entry in entries {
                if !content.is_empty() {
                    content.push_str("\n\n");
                }
                content.push_str(&entry.content);
            }

            let tokens = tokenizer.count(&content, self.model);
            total_tokens += tokens;

            files.push(ChunkFile { path: path.to_owned(), content, tokens, truncated: false });
        }

        Chunk {
            index,
            total: 0,
            focus: focus.clone(),
            tokens: total_tokens,
            files,
            context: ChunkContext {
                previous_summary: None,
                current_focus: focus,
                next_preview: None,
                cross_references: Vec::new(),
                overlap_content: None,
            },
        }
    }

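    /// Produces a short human-readable label: shared module, shared language,
    /// or "Mixed content".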
    fn determine_focus(&self, files: &[RepoFile]) -> String {
        if files.is_empty() {
            return "Empty".to_owned();
        }

        let first_path = &files[0].relative_path;
        if let Some(module) = first_path.split('/').next() {
            if files.iter().all(|f| f.relative_path.starts_with(module)) {
                return format!("{} module", module);
            }
        }

        if let Some(lang) = &files[0].language {
            if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
                return format!("{} files", lang);
            }
        }

        "Mixed content".to_owned()
    }

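    /// Labels a symbol chunk by its first few symbol names, e.g.
    /// "Symbols: foo, bar, baz +2 more".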
    fn determine_symbol_focus(&self, snippets: &[SymbolSnippet]) -> String {
        if snippets.is_empty() {
            return "Symbols".to_owned();
        }

        let mut names: Vec<String> = snippets
            .iter()
            .take(3)
            .map(|snippet| snippet.symbol_name.clone())
            .collect();

        let suffix = if snippets.len() > names.len() {
            format!(" +{} more", snippets.len() - names.len())
        } else {
            String::new()
        };

        if names.len() == 1 {
            format!("Symbol: {}{}", names.remove(0), suffix)
        } else {
            format!("Symbols: {}{}", names.join(", "), suffix)
        }
    }

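    /// Returns the last file of the previous chunk as overlap, but only if it
    /// is small enough to fit inside the overlap budget.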
    fn get_overlap_files(&self, files: &[RepoFile]) -> Vec<RepoFile> {
        files
            .last()
            .filter(|f| f.token_count.get(self.model) < self.overlap_tokens)
            .cloned()
            .into_iter()
            .collect()
    }

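    /// Second pass over assembled chunks: fills in totals, previous/next
    /// navigation hints, overlap excerpts, and cross-references.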
    fn finalize_chunks(&self, mut chunks: Vec<Chunk>, repo: &Repository) -> Vec<Chunk> {
        let total = chunks.len();

        let focus_strs: Vec<String> = chunks.iter().map(|c| c.focus.clone()).collect();

        let overlap_contents: Vec<Option<String>> = if self.overlap_tokens > 0 {
            chunks
                .iter()
                .map(|chunk| self.extract_overlap_content(chunk))
                .collect()
        } else {
            vec![None; chunks.len()]
        };

        for (i, chunk) in chunks.iter_mut().enumerate() {
            chunk.total = total;

            if i > 0 {
                chunk.context.previous_summary = Some(format!("Previous: {}", focus_strs[i - 1]));

                if let Some(ref overlap) = overlap_contents[i - 1] {
                    chunk.context.overlap_content = Some(format!(
                        "<!-- [OVERLAP FROM PREVIOUS CHUNK] -->\n{}\n<!-- [END OVERLAP] -->",
                        overlap
                    ));
                }
            }

            if i + 1 < total {
                chunk.context.next_preview = Some(format!("Next: Chunk {}", i + 2));
            }
        }

        self.populate_cross_references(&mut chunks, repo);

        chunks
    }

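    /// Indexes every non-import symbol by chunk, then records where each
    /// chunk's calls, base classes, and implemented interfaces are defined in
    /// other chunks (capped at `MAX_REFS` per chunk).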
    fn populate_cross_references(&self, chunks: &mut [Chunk], repo: &Repository) {
        use std::collections::{HashMap, HashSet};

        const MAX_REFS: usize = 25;

        #[derive(Clone)]
        struct SymbolLocation {
            chunk_index: usize,
            file: String,
        }

        let file_lookup: HashMap<&str, &RepoFile> = repo
            .files
            .iter()
            .map(|file| (file.relative_path.as_str(), file))
            .collect();

        let mut symbol_index: HashMap<String, Vec<SymbolLocation>> = HashMap::new();
        let mut seen_symbols: HashSet<(String, usize, String)> = HashSet::new();

        for (chunk_index, chunk) in chunks.iter().enumerate() {
            for chunk_file in &chunk.files {
                if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
                    for symbol in &repo_file.symbols {
                        if symbol.kind == SymbolKind::Import {
                            continue;
                        }
                        let key = (symbol.name.clone(), chunk_index, chunk_file.path.clone());
                        if seen_symbols.insert(key) {
                            symbol_index.entry(symbol.name.clone()).or_default().push(
                                SymbolLocation { chunk_index, file: chunk_file.path.clone() },
                            );
                        }
                    }
                }
            }
        }

        for (chunk_index, chunk) in chunks.iter_mut().enumerate() {
            let mut refs: Vec<CrossReference> = Vec::new();
            let mut seen_refs: HashSet<(String, usize, String)> = HashSet::new();

            'files: for chunk_file in &chunk.files {
                if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
                    for symbol in &repo_file.symbols {
                        for called in &symbol.calls {
                            if let Some(targets) = symbol_index.get(called) {
                                for target in targets {
                                    if target.chunk_index == chunk_index {
                                        continue;
                                    }
                                    let key = (
                                        called.to_owned(),
                                        target.chunk_index,
                                        target.file.clone(),
                                    );
                                    if seen_refs.insert(key) {
                                        refs.push(CrossReference {
                                            symbol: called.to_owned(),
                                            chunk_index: target.chunk_index,
                                            file: target.file.clone(),
                                        });
                                        if refs.len() >= MAX_REFS {
                                            break 'files;
                                        }
                                    }
                                }
                            }
                        }

                        if let Some(ref base) = symbol.extends {
                            if let Some(targets) = symbol_index.get(base) {
                                for target in targets {
                                    if target.chunk_index == chunk_index {
                                        continue;
                                    }
                                    let key =
                                        (base.to_owned(), target.chunk_index, target.file.clone());
                                    if seen_refs.insert(key) {
                                        refs.push(CrossReference {
                                            symbol: base.to_owned(),
                                            chunk_index: target.chunk_index,
                                            file: target.file.clone(),
                                        });
                                        if refs.len() >= MAX_REFS {
                                            break 'files;
                                        }
                                    }
                                }
                            }
                        }

                        for iface in &symbol.implements {
                            if let Some(targets) = symbol_index.get(iface) {
                                for target in targets {
                                    if target.chunk_index == chunk_index {
                                        continue;
                                    }
                                    let key =
                                        (iface.to_owned(), target.chunk_index, target.file.clone());
                                    if seen_refs.insert(key) {
                                        refs.push(CrossReference {
                                            symbol: iface.to_owned(),
                                            chunk_index: target.chunk_index,
                                            file: target.file.clone(),
                                        });
                                        if refs.len() >= MAX_REFS {
                                            break 'files;
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }

            refs.sort_by(|a, b| {
                a.chunk_index
                    .cmp(&b.chunk_index)
                    .then_with(|| a.symbol.cmp(&b.symbol))
                    .then_with(|| a.file.cmp(&b.file))
            });
            if refs.len() > MAX_REFS {
                refs.truncate(MAX_REFS);
            }

            chunk.context.cross_references = refs;
        }
    }

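    /// Collects up to `overlap_tokens` worth of content from the tail of a
    /// chunk, taking whole files where possible and trailing lines otherwise.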
    fn extract_overlap_content(&self, chunk: &Chunk) -> Option<String> {
        use crate::tokenizer::Tokenizer;

        if self.overlap_tokens == 0 || chunk.files.is_empty() {
            return None;
        }

        let tokenizer = Tokenizer::new();
        let mut overlap_parts = Vec::new();
        let mut remaining_tokens = self.overlap_tokens;
        let token_model = self.model;

        // Walk files from the end of the chunk, since the overlap should carry
        // the most recent content into the next chunk.
        for file in chunk.files.iter().rev() {
            if remaining_tokens == 0 {
                break;
            }

            let file_tokens = tokenizer.count(&file.content, token_model);
            if file_tokens <= remaining_tokens {
                overlap_parts.push(format!("// From: {}\n{}", file.path, file.content));
                remaining_tokens = remaining_tokens.saturating_sub(file_tokens);
            } else {
                // The file does not fit whole: take as many trailing lines as
                // the remaining budget allows.
                let lines: Vec<&str> = file.content.lines().collect();
                let mut partial_lines = Vec::new();
                let mut partial_tokens = 0u32;

                for line in lines.iter().rev() {
                    let line_tokens = tokenizer.count(line, token_model);
                    if partial_tokens + line_tokens > remaining_tokens {
                        break;
                    }
                    partial_lines.push(*line);
                    partial_tokens += line_tokens;
                }

                if !partial_lines.is_empty() {
                    partial_lines.reverse();
                    let partial_content = partial_lines.join("\n");
                    overlap_parts
                        .push(format!("// From: {} (partial)\n{}", file.path, partial_content));
                }
                remaining_tokens = 0;
            }
        }

        if overlap_parts.is_empty() {
            None
        } else {
            overlap_parts.reverse();
            Some(overlap_parts.join("\n\n"))
        }
    }
}

#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use crate::types::{Symbol, SymbolKind, TokenCounts, Visibility};

    fn create_test_repo() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        for i in 0..5 {
            repo.files.push(RepoFile {
                path: format!("/tmp/test/src/file{}.py", i).into(),
                relative_path: format!("src/file{}.py", i),
                language: Some("python".to_string()),
                size_bytes: 1000,
                token_count: TokenCounts {
                    o200k: 480,
                    cl100k: 490,
                    claude: 500,
                    gemini: 470,
                    llama: 460,
                    mistral: 460,
                    deepseek: 460,
                    qwen: 460,
                    cohere: 465,
                    grok: 460,
                },
                symbols: Vec::new(),
                importance: 0.5,
                content: Some(format!("# File {}\ndef func{}(): pass", i, i)),
            });
        }

        repo
    }

    #[test]
    fn test_fixed_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks
            .iter()
            .all(|c| c.tokens <= 1000 || c.files.len() == 1));
    }

    #[test]
    fn test_file_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::File, 8000);
        let chunks = chunker.chunk(&repo);

        assert_eq!(chunks.len(), repo.files.len());
    }

    #[test]
    fn test_semantic_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Semantic, 2000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_symbol_chunking() {
        let mut repo = create_test_repo();
        if let Some(file) = repo.files.get_mut(0) {
            let mut symbol = Symbol::new("func0", SymbolKind::Function);
            symbol.start_line = 1;
            symbol.end_line = 1;
            symbol.visibility = Visibility::Public;
            file.symbols.push(symbol);
        }

        let chunker = Chunker::new(ChunkStrategy::Symbol, 500);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }
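
    // A sketch of a module-chunking check, assuming the create_test_repo
    // fixture above (five ~500-token files under src/): every file should
    // land in exactly one chunk and chunk totals should agree.
    #[test]
    fn test_module_chunking() {
        let repo = create_test_repo();
        let chunker = Chunker::new(ChunkStrategy::Module, 1000);
        let chunks = chunker.chunk(&repo);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| c.total == chunks.len()));

        let files_in_chunks: usize = chunks.iter().map(|c| c.files.len()).sum();
        assert_eq!(files_in_chunks, repo.files.len());
    }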
}