1mod strategies;
7mod types;
8
9use types::SymbolSnippet;
10pub use types::{Chunk, ChunkContext, ChunkFile, ChunkStrategy, Chunker, CrossReference};
11
12use crate::tokenizer::Tokenizer;
13use crate::types::{RepoFile, Repository, SymbolKind, TokenizerModel};
14use std::collections::{BTreeMap, HashMap, HashSet};
15
16impl Chunker {
17 pub fn new(strategy: ChunkStrategy, max_tokens: u32) -> Self {
19 Self { strategy, max_tokens, overlap_tokens: 200, model: TokenizerModel::Claude }
20 }
21
22 pub fn with_overlap(mut self, tokens: u32) -> Self {
24 self.overlap_tokens = tokens;
25 self
26 }
27
28 pub fn with_model(mut self, model: TokenizerModel) -> Self {
30 self.model = model;
31 self
32 }
33
34 pub fn chunk(&self, repo: &Repository) -> Vec<Chunk> {
36 match self.strategy {
37 ChunkStrategy::Fixed { size } => self.fixed_chunk(repo, size),
38 ChunkStrategy::File => self.file_chunk(repo),
39 ChunkStrategy::Module => self.module_chunk(repo),
40 ChunkStrategy::Symbol => self.symbol_chunk(repo),
41 ChunkStrategy::Semantic => self.semantic_chunk(repo),
42 ChunkStrategy::Dependency => self.dependency_chunk(repo),
43 }
44 }
45
46 pub(crate) fn create_chunk(&self, index: usize, files: &[RepoFile], tokens: u32) -> Chunk {
51 let focus = self.determine_focus(files);
52
53 Chunk {
54 index,
55 total: 0, focus: focus.clone(),
57 tokens,
58 files: files
59 .iter()
60 .map(|f| ChunkFile {
61 path: f.relative_path.clone(),
62 content: f.content.clone().unwrap_or_default(),
63 tokens: f.token_count.get(self.model),
64 truncated: false,
65 })
66 .collect(),
67 context: ChunkContext {
68 previous_summary: None,
69 current_focus: focus,
70 next_preview: None,
71 cross_references: Vec::new(),
72 overlap_content: None,
73 },
74 }
75 }
76
77 pub(crate) fn create_chunk_from_refs(
79 &self,
80 index: usize,
81 files: &[&RepoFile],
82 tokens: u32,
83 ) -> Chunk {
84 let focus = self.determine_focus_refs(files);
85
86 Chunk {
87 index,
88 total: 0, focus: focus.clone(),
90 tokens,
91 files: files
92 .iter()
93 .map(|f| ChunkFile {
94 path: f.relative_path.clone(),
95 content: f.content.clone().unwrap_or_default(),
96 tokens: f.token_count.get(self.model),
97 truncated: false,
98 })
99 .collect(),
100 context: ChunkContext {
101 previous_summary: None,
102 current_focus: focus,
103 next_preview: None,
104 cross_references: Vec::new(),
105 overlap_content: None,
106 },
107 }
108 }
109
110 pub(crate) fn build_symbol_chunk(
111 &self,
112 index: usize,
113 snippets: &[SymbolSnippet],
114 tokenizer: &Tokenizer,
115 ) -> Chunk {
116 let focus = self.determine_symbol_focus(snippets);
117 let mut by_file: BTreeMap<&str, Vec<&SymbolSnippet>> = BTreeMap::new();
118
119 for snippet in snippets {
120 by_file
121 .entry(snippet.file_path.as_str())
122 .or_default()
123 .push(snippet);
124 }
125
126 let mut files = Vec::new();
127 let mut total_tokens = 0u32;
128
129 for (path, mut entries) in by_file {
130 entries.sort_by(|a, b| {
131 a.start_line
132 .cmp(&b.start_line)
133 .then_with(|| a.symbol_name.cmp(&b.symbol_name))
134 });
135
136 let mut content = String::new();
137 for entry in entries {
138 if !content.is_empty() {
139 content.push_str("\n\n");
140 }
141 content.push_str(&entry.content);
142 }
143
144 let tokens = tokenizer.count(&content, self.model);
145 total_tokens += tokens;
146
147 files.push(ChunkFile { path: path.to_owned(), content, tokens, truncated: false });
148 }
149
150 Chunk {
151 index,
152 total: 0,
153 focus: focus.clone(),
154 tokens: total_tokens,
155 files,
156 context: ChunkContext {
157 previous_summary: None,
158 current_focus: focus,
159 next_preview: None,
160 cross_references: Vec::new(),
161 overlap_content: None,
162 },
163 }
164 }
165
166 fn determine_focus(&self, files: &[RepoFile]) -> String {
171 if files.is_empty() {
172 return "Empty".to_owned();
173 }
174
175 let first_path = &files[0].relative_path;
177 if let Some(module) = first_path.split('/').next() {
178 if files.iter().all(|f| f.relative_path.starts_with(module)) {
179 return format!("{} module", module);
180 }
181 }
182
183 if let Some(lang) = &files[0].language {
185 if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
186 return format!("{} files", lang);
187 }
188 }
189
190 "Mixed content".to_owned()
191 }
192
193 fn determine_focus_refs(&self, files: &[&RepoFile]) -> String {
195 if files.is_empty() {
196 return "Empty".to_owned();
197 }
198
199 let first_path = &files[0].relative_path;
201 if let Some(module) = first_path.split('/').next() {
202 if files.iter().all(|f| f.relative_path.starts_with(module)) {
203 return format!("{} module", module);
204 }
205 }
206
207 if let Some(lang) = &files[0].language {
209 if files.iter().all(|f| f.language.as_ref() == Some(lang)) {
210 return format!("{} files", lang);
211 }
212 }
213
214 "Mixed content".to_owned()
215 }
216
217 fn determine_symbol_focus(&self, snippets: &[SymbolSnippet]) -> String {
218 if snippets.is_empty() {
219 return "Symbols".to_owned();
220 }
221
222 let mut names: Vec<String> = snippets
223 .iter()
224 .take(3)
225 .map(|snippet| snippet.symbol_name.clone())
226 .collect();
227
228 let suffix = if snippets.len() > names.len() {
229 format!(" +{} more", snippets.len() - names.len())
230 } else {
231 String::new()
232 };
233
234 if names.len() == 1 {
235 format!("Symbol: {}{}", names.remove(0), suffix)
236 } else {
237 format!("Symbols: {}{}", names.join(", "), suffix)
238 }
239 }
240
241 pub(crate) fn get_overlap_files(&self, files: &[RepoFile]) -> Vec<RepoFile> {
246 files
249 .last()
250 .filter(|f| f.token_count.get(self.model) < self.overlap_tokens)
251 .cloned()
252 .into_iter()
253 .collect()
254 }
255
256 pub(crate) fn finalize_chunks(&self, mut chunks: Vec<Chunk>, repo: &Repository) -> Vec<Chunk> {
257 let total = chunks.len();
258
259 let focus_strs: Vec<String> = chunks.iter().map(|c| c.focus.clone()).collect();
261
262 let overlap_contents: Vec<Option<String>> = if self.overlap_tokens > 0 {
264 chunks
265 .iter()
266 .map(|chunk| self.extract_overlap_content(chunk))
267 .collect()
268 } else {
269 vec![None; chunks.len()]
270 };
271
272 for (i, chunk) in chunks.iter_mut().enumerate() {
273 chunk.total = total;
274
275 if i > 0 {
277 chunk.context.previous_summary = Some(format!("Previous: {}", focus_strs[i - 1]));
278
279 if let Some(ref overlap) = overlap_contents[i - 1] {
281 chunk.context.overlap_content = Some(format!(
282 "<!-- [OVERLAP FROM PREVIOUS CHUNK] -->\n{}\n<!-- [END OVERLAP] -->",
283 overlap
284 ));
285 }
286 }
287
288 if i + 1 < total {
290 chunk.context.next_preview = Some(format!("Next: Chunk {}", i + 2));
291 }
292 }
293
294 self.populate_cross_references(&mut chunks, repo);
295
296 chunks
297 }
298
299 fn populate_cross_references(&self, chunks: &mut [Chunk], repo: &Repository) {
300 const MAX_REFS: usize = 25;
301
302 #[derive(Clone)]
303 struct SymbolLocation {
304 chunk_index: usize,
305 file: String,
306 }
307
308 let file_lookup: HashMap<&str, &RepoFile> = repo
309 .files
310 .iter()
311 .map(|file| (file.relative_path.as_str(), file))
312 .collect();
313
314 let mut symbol_index: HashMap<String, Vec<SymbolLocation>> = HashMap::new();
315 let mut seen_symbols: HashSet<(String, usize, String)> = HashSet::new();
316
317 for (chunk_index, chunk) in chunks.iter().enumerate() {
318 for chunk_file in &chunk.files {
319 if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
320 for symbol in &repo_file.symbols {
321 if symbol.kind == SymbolKind::Import {
322 continue;
323 }
324 let key = (symbol.name.clone(), chunk_index, chunk_file.path.clone());
325 if seen_symbols.insert(key) {
326 symbol_index.entry(symbol.name.clone()).or_default().push(
327 SymbolLocation { chunk_index, file: chunk_file.path.clone() },
328 );
329 }
330 }
331 }
332 }
333 }
334
335 for (chunk_index, chunk) in chunks.iter_mut().enumerate() {
336 let mut refs: Vec<CrossReference> = Vec::new();
337 let mut seen_refs: HashSet<(String, usize, String)> = HashSet::new();
338
339 'files: for chunk_file in &chunk.files {
340 if let Some(repo_file) = file_lookup.get(chunk_file.path.as_str()) {
341 for symbol in &repo_file.symbols {
342 for called in &symbol.calls {
343 if let Some(targets) = symbol_index.get(called) {
344 for target in targets {
345 if target.chunk_index == chunk_index {
346 continue;
347 }
348 let key = (
349 called.to_owned(),
350 target.chunk_index,
351 target.file.clone(),
352 );
353 if seen_refs.insert(key) {
354 refs.push(CrossReference {
355 symbol: called.to_owned(),
356 chunk_index: target.chunk_index,
357 file: target.file.clone(),
358 });
359 if refs.len() >= MAX_REFS {
360 break 'files;
361 }
362 }
363 }
364 }
365 }
366
367 if let Some(ref base) = symbol.extends {
368 if let Some(targets) = symbol_index.get(base) {
369 for target in targets {
370 if target.chunk_index == chunk_index {
371 continue;
372 }
373 let key =
374 (base.to_owned(), target.chunk_index, target.file.clone());
375 if seen_refs.insert(key) {
376 refs.push(CrossReference {
377 symbol: base.to_owned(),
378 chunk_index: target.chunk_index,
379 file: target.file.clone(),
380 });
381 if refs.len() >= MAX_REFS {
382 break 'files;
383 }
384 }
385 }
386 }
387 }
388
389 for iface in &symbol.implements {
390 if let Some(targets) = symbol_index.get(iface) {
391 for target in targets {
392 if target.chunk_index == chunk_index {
393 continue;
394 }
395 let key =
396 (iface.to_owned(), target.chunk_index, target.file.clone());
397 if seen_refs.insert(key) {
398 refs.push(CrossReference {
399 symbol: iface.to_owned(),
400 chunk_index: target.chunk_index,
401 file: target.file.clone(),
402 });
403 if refs.len() >= MAX_REFS {
404 break 'files;
405 }
406 }
407 }
408 }
409 }
410 }
411 }
412 }
413
414 refs.sort_by(|a, b| {
415 a.chunk_index
416 .cmp(&b.chunk_index)
417 .then_with(|| a.symbol.cmp(&b.symbol))
418 .then_with(|| a.file.cmp(&b.file))
419 });
420 if refs.len() > MAX_REFS {
421 refs.truncate(MAX_REFS);
422 }
423
424 chunk.context.cross_references = refs;
425 }
426 }
427
428 fn extract_overlap_content(&self, chunk: &Chunk) -> Option<String> {
430 if self.overlap_tokens == 0 || chunk.files.is_empty() {
431 return None;
432 }
433
434 let tokenizer = Tokenizer::new();
435 let mut overlap_parts = Vec::new();
436 let mut remaining_tokens = self.overlap_tokens;
437 let token_model = self.model;
438
439 for file in chunk.files.iter().rev() {
441 if remaining_tokens == 0 {
442 break;
443 }
444
445 let file_tokens = tokenizer.count(&file.content, token_model);
446 if file_tokens <= remaining_tokens {
447 overlap_parts.push(format!("// From: {}\n{}", file.path, file.content));
449 remaining_tokens = remaining_tokens.saturating_sub(file_tokens);
450 } else {
451 let lines: Vec<&str> = file.content.lines().collect();
453 let mut partial_lines = Vec::new();
454 let mut partial_tokens = 0u32;
455
456 for line in lines.iter().rev() {
457 let line_tokens = tokenizer.count(line, token_model);
458 if partial_tokens + line_tokens > remaining_tokens {
459 break;
460 }
461 partial_lines.push(*line);
462 partial_tokens += line_tokens;
463 }
464
465 if !partial_lines.is_empty() {
466 partial_lines.reverse();
467 let partial_content = partial_lines.join("\n");
468 overlap_parts
469 .push(format!("// From: {} (partial)\n{}", file.path, partial_content));
470 }
471 remaining_tokens = 0;
472 }
473 }
474
475 if overlap_parts.is_empty() {
476 None
477 } else {
478 overlap_parts.reverse();
479 Some(overlap_parts.join("\n\n"))
480 }
481 }
482}
483
#[cfg(test)]
#[allow(clippy::str_to_string)]
mod tests {
    use super::*;
    use crate::types::{Symbol, SymbolKind, TokenCounts, Visibility};

    /// Builds a five-file Python repository with fixed per-model token
    /// counts and tiny synthetic contents.
    fn create_test_repo() -> Repository {
        let mut repo = Repository::new("test", "/tmp/test");

        repo.files.extend((0..5).map(|idx| RepoFile {
            path: format!("/tmp/test/src/file{}.py", idx).into(),
            relative_path: format!("src/file{}.py", idx),
            language: Some("python".to_string()),
            size_bytes: 1000,
            token_count: TokenCounts {
                o200k: 480,
                cl100k: 490,
                claude: 500,
                gemini: 470,
                llama: 460,
                mistral: 460,
                deepseek: 460,
                qwen: 460,
                cohere: 465,
                grok: 460,
            },
            symbols: Vec::new(),
            importance: 0.5,
            content: Some(format!("# File {}\ndef func{}(): pass", idx, idx)),
        }));

        repo
    }

    #[test]
    fn test_fixed_chunking() {
        let repo = create_test_repo();
        let chunks = Chunker::new(ChunkStrategy::Fixed { size: 1000 }, 1000).chunk(&repo);

        assert!(!chunks.is_empty());
        // Every chunk respects the cap unless it holds one oversized file.
        assert!(chunks
            .iter()
            .all(|c| c.tokens <= 1000 || c.files.len() == 1));
    }

    #[test]
    fn test_file_chunking() {
        let repo = create_test_repo();
        let chunks = Chunker::new(ChunkStrategy::File, 8000).chunk(&repo);

        // File strategy yields exactly one chunk per file.
        assert_eq!(chunks.len(), repo.files.len());
    }

    #[test]
    fn test_semantic_chunking() {
        let repo = create_test_repo();
        let chunks = Chunker::new(ChunkStrategy::Semantic, 2000).chunk(&repo);

        assert!(!chunks.is_empty());
        // `total` must be consistent across all finalized chunks.
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }

    #[test]
    fn test_symbol_chunking() {
        let mut repo = create_test_repo();
        // Give the first file a single public function symbol on line 1.
        if let Some(file) = repo.files.first_mut() {
            let mut symbol = Symbol::new("func0", SymbolKind::Function);
            symbol.start_line = 1;
            symbol.end_line = 1;
            symbol.visibility = Visibility::Public;
            file.symbols.push(symbol);
        }

        let chunks = Chunker::new(ChunkStrategy::Symbol, 500).chunk(&repo);

        assert!(!chunks.is_empty());
        // `total` must be consistent across all finalized chunks.
        assert!(chunks.iter().all(|c| c.total == chunks.len()));
    }
}