1use std::path::Path;
9use streaming_iterator::StreamingIterator;
10use tree_sitter::{Parser, QueryCursor};
11
/// Size limits that control how source text is split into chunks.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Largest byte length a definition (or a whole file) may have and still
    /// be emitted as a single chunk; anything larger is split into windows.
    /// Also the budget used when deciding whether a context header fits.
    pub max_chunk_bytes: usize,
    /// Target byte length of each sliding window.
    pub window_size: usize,
    /// Nominal number of bytes shared by two consecutive windows; the window
    /// step is `window_size - window_overlap` (never less than 1).
    pub window_overlap: usize,
}

impl Default for ChunkConfig {
    /// 4 KiB chunk limit, 2 KiB windows, 512 bytes of overlap.
    fn default() -> Self {
        const KIB: usize = 1024;
        Self {
            max_chunk_bytes: 4 * KIB,
            window_size: 2 * KIB,
            window_overlap: KIB / 2,
        }
    }
}
41
42#[derive(
44 Debug,
45 Clone,
46 rkyv::Archive,
47 rkyv::Serialize,
48 rkyv::Deserialize,
49 bitcode::Encode,
50 bitcode::Decode,
51)]
52pub struct CodeChunk {
53 pub file_path: String,
55 pub name: String,
57 pub kind: String,
59 pub start_line: usize,
61 pub end_line: usize,
63 pub content: String,
65 pub enriched_content: String,
68}
69
70#[must_use]
77pub fn build_scope_chain(node: tree_sitter::Node<'_>, source: &str) -> String {
78 const CONTAINER_KINDS: &[&str] = &[
80 "impl_item",
82 "trait_item",
83 "mod_item",
84 "class_definition",
86 "module",
87 "class_declaration",
89 "type_declaration",
93 "namespace_definition",
95 "class_specifier",
96 ];
97
98 const NAME_FIELDS: &[&str] = &["name", "type"];
102
103 let mut parts = Vec::new();
104 let mut current = node.parent();
105 while let Some(parent) = current {
106 let kind = parent.kind();
107 if CONTAINER_KINDS.contains(&kind) {
108 let name = NAME_FIELDS
109 .iter()
110 .find_map(|field| parent.child_by_field_name(field))
111 .map_or(kind, |n| &source[n.start_byte()..n.end_byte()]);
112 parts.push(format!("{kind} {name}"));
113 }
114 current = parent.parent();
115 }
116 parts.reverse();
117 parts.join(" > ")
118}
119
120#[must_use]
126pub fn extract_signature(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
127 let name_node = node.child_by_field_name("name")?;
128 let body_node = node
129 .child_by_field_name("body")
130 .or_else(|| node.child_by_field_name("block"))?;
131 let start = name_node.start_byte();
132 let end = body_node.start_byte();
133 if start >= end {
134 return None;
135 }
136 let sig = source[start..end].trim();
137 if sig.is_empty() {
138 None
139 } else {
140 Some(sig.to_string())
141 }
142}
143
/// Compacts the whitespace of `source` without touching its non-blank text.
///
/// * Leading indentation is measured with a space counting 1 and a tab
///   counting 2, halved (rounding up) and re-emitted as spaces.
/// * Trailing whitespace on every line is removed.
/// * A run of consecutive blank lines is reduced to a single empty line.
/// * A trailing newline is produced only if `source` itself ends with one.
#[must_use]
pub fn minify_whitespace(source: &str) -> String {
    let mut out = String::with_capacity(source.len());
    let mut previous_was_blank = false;

    for raw in source.lines() {
        let text = raw.trim_start();

        if text.is_empty() {
            // Only the first line of a blank run is kept.
            if !previous_was_blank {
                out.push('\n');
            }
            previous_was_blank = true;
            continue;
        }
        previous_was_blank = false;

        // Space and tab are ASCII, so scanning bytes is equivalent to
        // scanning chars here.
        let width: usize = raw
            .bytes()
            .take_while(|&b| b == b' ' || b == b'\t')
            .map(|b| if b == b'\t' { 2 } else { 1 })
            .sum();

        out.extend(std::iter::repeat(' ').take(width.div_ceil(2)));
        out.push_str(text.trim_end());
        out.push('\n');
    }

    // Every emitted line ends in '\n'; undo the last one when the input had
    // no final newline.
    if !source.ends_with('\n') && out.ends_with('\n') {
        out.pop();
    }

    out
}
194
195fn build_enriched_content(
200 path: &Path,
201 node: tree_sitter::Node<'_>,
202 source: &str,
203 content: &str,
204 max_bytes: usize,
205) -> String {
206 let scope = build_scope_chain(node, source);
207 let sig = extract_signature(node, source).unwrap_or_default();
208 let rel_path = path.display().to_string();
209
210 let header = if scope.is_empty() && sig.is_empty() {
211 format!("// {rel_path}\n")
212 } else if scope.is_empty() {
213 format!("// {rel_path} | defines: {sig}\n")
214 } else if sig.is_empty() {
215 format!("// {rel_path} | {scope}\n")
216 } else {
217 format!("// {rel_path} | {scope} | defines: {sig}\n")
218 };
219
220 let minified = minify_whitespace(content);
223
224 if header.len() + minified.len() > max_bytes {
225 minified
226 } else {
227 format!("{header}{minified}")
228 }
229}
230
231#[must_use]
240pub fn chunk_file(
241 path: &Path,
242 source: &str,
243 config: &crate::languages::LangConfig,
244 chunk_config: &ChunkConfig,
245) -> Vec<CodeChunk> {
246 let mut parser = Parser::new();
247 if parser.set_language(&config.language).is_err() {
248 return sliding_windows(path, source, chunk_config);
249 }
250
251 let Some(tree) = parser.parse(source, None) else {
252 return sliding_windows(path, source, chunk_config);
253 };
254
255 let mut cursor = QueryCursor::new();
256 let mut chunks = Vec::new();
257 let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
258
259 while let Some(m) = matches.next() {
260 let mut name = String::new();
261 let mut def_node = None;
262 for cap in m.captures {
263 let cap_name = &config.query.capture_names()[cap.index as usize];
264 if *cap_name == "name" {
265 name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
266 } else if *cap_name == "def" {
267 def_node = Some(cap.node);
268 }
269 }
270 if let Some(node) = def_node {
271 let content = &source[node.start_byte()..node.end_byte()];
272 let start_line = node.start_position().row + 1;
273
274 if content.len() > chunk_config.max_chunk_bytes {
276 chunks.extend(sliding_windows_with_name(
277 path,
278 content,
279 &name,
280 start_line,
281 chunk_config,
282 ));
283 } else {
284 let enriched = build_enriched_content(
285 path,
286 node,
287 source,
288 content,
289 chunk_config.max_chunk_bytes,
290 );
291 chunks.push(CodeChunk {
292 file_path: path.display().to_string(),
293 name,
294 kind: node.kind().to_string(),
295 start_line,
296 end_line: node.end_position().row + 1,
297 enriched_content: enriched,
298 content: content.to_string(),
299 });
300 }
301 }
302 }
303
304 if chunks.is_empty() && !source.trim().is_empty() {
306 return sliding_windows(path, source, chunk_config);
307 }
308
309 chunks
310}
311
312#[must_use]
322pub fn chunk_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
323 sliding_windows(path, source, chunk_config)
324}
325
326fn sliding_windows(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
328 if source.trim().is_empty() {
329 return vec![];
330 }
331
332 if source.len() <= chunk_config.max_chunk_bytes {
334 let content = source.to_string();
335 return vec![CodeChunk {
336 file_path: path.display().to_string(),
337 name: path
338 .file_name()
339 .unwrap_or_default()
340 .to_string_lossy()
341 .to_string(),
342 kind: "file".to_string(),
343 start_line: 1,
344 end_line: source.lines().count(),
345 enriched_content: content.clone(),
346 content,
347 }];
348 }
349
350 let file_name = path
351 .file_name()
352 .unwrap_or_default()
353 .to_string_lossy()
354 .to_string();
355 sliding_window_chunks(source, path, &file_name, 1, chunk_config)
356}
357
358fn sliding_windows_with_name(
363 path: &Path,
364 content: &str,
365 name: &str,
366 base_line: usize,
367 chunk_config: &ChunkConfig,
368) -> Vec<CodeChunk> {
369 sliding_window_chunks(content, path, name, base_line, chunk_config)
370}
371
372fn sliding_window_chunks(
379 source: &str,
380 file_path: &Path,
381 name_prefix: &str,
382 base_line: usize,
383 chunk_config: &ChunkConfig,
384) -> Vec<CodeChunk> {
385 let step = chunk_config
386 .window_size
387 .saturating_sub(chunk_config.window_overlap)
388 .max(1);
389 let bytes = source.as_bytes();
390 let mut chunks = Vec::new();
391 let mut offset = 0;
392 let mut window_idx = 0;
393
394 while offset < bytes.len() {
395 let raw_end = (offset + chunk_config.window_size).min(bytes.len());
396
397 let end = if raw_end < bytes.len() {
399 match bytes[offset..raw_end].iter().rposition(|&b| b == b'\n') {
400 Some(pos) => offset + pos + 1,
401 None => raw_end, }
403 } else {
404 raw_end
405 };
406
407 if let Ok(window) = std::str::from_utf8(&bytes[offset..end])
409 && !window.trim().is_empty()
410 {
411 let start_line = base_line + source[..offset].matches('\n').count();
412 let content_lines = window.lines().count().max(1);
413 let end_line = start_line + content_lines - 1;
414 let content = window.to_string();
415 chunks.push(CodeChunk {
416 file_path: file_path.display().to_string(),
417 name: format!("{name_prefix}[{window_idx}]"),
418 kind: "window".to_string(),
419 start_line,
420 end_line,
421 enriched_content: content.clone(),
422 content,
423 });
424 window_idx += 1;
425 }
426
427 offset += step;
428 }
429
430 chunks
431}
432
#[cfg(test)]
mod tests {
    use super::*;
    use std::fmt::Write as _;
    use std::path::Path;

    /// Parses `source` with the grammar registered for file extension `ext`
    /// and returns the tree together with the language configuration.
    fn parse_with(
        source: &str,
        ext: &str,
    ) -> (
        tree_sitter::Tree,
        std::sync::Arc<crate::languages::LangConfig>,
    ) {
        let config = crate::languages::config_for_extension(ext).unwrap();
        let mut parser = Parser::new();
        parser.set_language(&config.language).unwrap();
        let tree = parser.parse(source, None).unwrap();
        (tree, config)
    }

    /// Runs the language query over `tree` and returns the last node captured
    /// as `def`, optionally restricted to nodes of kind `wanted_kind`.
    fn last_def_node<'tree>(
        tree: &'tree tree_sitter::Tree,
        config: &crate::languages::LangConfig,
        source: &str,
        wanted_kind: Option<&str>,
    ) -> Option<tree_sitter::Node<'tree>> {
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        let mut found = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" && wanted_kind.is_none_or(|kind| cap.node.kind() == kind) {
                    found = Some(cap.node);
                }
            }
        }
        found
    }

    #[test]
    fn chunks_rust_functions_and_structs() {
        let source = "fn hello() { println!(\"hi\"); }\nfn world() {}\nstruct Foo { x: i32 }";
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("test.rs"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {}",
            chunks.len()
        );
        assert!(chunks.iter().any(|c| c.name == "hello"));
        assert!(chunks.iter().any(|c| c.name == "world"));
    }

    #[test]
    fn chunks_python_functions_and_classes() {
        let source = "def greet(name):\n pass\n\nclass Foo:\n pass\n";
        let config = crate::languages::config_for_extension("py").unwrap();
        let chunks = chunk_file(
            Path::new("test.py"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.name == "greet"));
        assert!(chunks.iter().any(|c| c.name == "Foo"));
    }

    #[test]
    fn chunks_python_stub_functions_and_classes() {
        let source = "from typing import Protocol\n\ndef greet(name: str) -> str: ...\n\nclass Foo(Protocol):\n value: int\n";
        let config = crate::languages::config_for_extension("pyi").unwrap();
        let chunks = chunk_file(
            Path::new("test.pyi"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.name == "greet"));
        assert!(chunks.iter().any(|c| c.name == "Foo"));
    }

    #[test]
    fn fallback_small_file_single_chunk() {
        // A comment-only file yields no definition chunks, so the whole file
        // must come back as one "file" chunk.
        let source = "// just a comment\n// and another\n";
        let config = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(
            Path::new("script.js"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].kind, "file");
    }

    #[test]
    fn fallback_large_file_produces_windows() {
        let line = "console.log('hello world, this is a long line of javascript code');\n";
        let source: String = line.repeat(200);
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(Path::new("big.js"), &source, &config, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected multiple windows, got {}",
            chunks.len()
        );
        assert!(chunks.iter().all(|c| c.kind == "window"));
        assert!(chunks[0].name.contains("[0]"));
    }

    #[test]
    fn large_definition_is_windowed() {
        // One function whose body alone exceeds `max_chunk_bytes`.
        let mut source = String::from("fn big_function() {\n");
        for i in 0..200 {
            writeln!(source, " let var_{i} = {i} * 2 + 1; // some computation").unwrap();
        }
        source.push_str("}\n");
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("test.rs"), &source, &config, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected windowed chunks, got {}",
            chunks.len()
        );
        assert!(chunks[0].name.starts_with("big_function["));
    }

    #[test]
    fn empty_file_produces_no_chunks() {
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("empty.rs"), "", &config, &ChunkConfig::default());
        assert!(chunks.is_empty());
    }

    #[test]
    fn scope_chain_rust_impl_method() {
        let source = "impl Foo {\n fn bar(&self) {}\n}";
        let (tree, config) = parse_with(source, "rs");
        let node =
            last_def_node(&tree, &config, source, None).expect("should find a @def node");

        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("impl_item"),
            "scope should contain impl_item, got: {scope}"
        );
        assert!(
            scope.contains("Foo"),
            "scope should contain 'Foo', got: {scope}"
        );
    }

    #[test]
    fn scope_chain_python_class_method() {
        let source = "class Greeter:\n def say_hello(self):\n pass\n";
        let (tree, config) = parse_with(source, "py");
        // The class itself is also a @def; only the method is of interest.
        let node = last_def_node(&tree, &config, source, Some("function_definition"))
            .expect("should find say_hello @def node");

        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("class_definition"),
            "scope should contain class_definition, got: {scope}"
        );
        assert!(
            scope.contains("Greeter"),
            "scope should contain 'Greeter', got: {scope}"
        );
    }

    #[test]
    fn extract_signature_rust_function() {
        let source = "fn greet(name: &str) -> String { name.to_string() }";
        let (tree, config) = parse_with(source, "rs");
        let node = last_def_node(&tree, &config, source, None).expect("should find @def node");

        let sig = extract_signature(node, source).expect("should extract signature");
        assert!(
            sig.contains("greet"),
            "signature should contain 'greet', got: {sig}"
        );
        assert!(
            sig.contains("name: &str"),
            "signature should contain parameter, got: {sig}"
        );
        assert!(
            sig.contains("-> String"),
            "signature should contain return type, got: {sig}"
        );
    }

    #[test]
    fn enriched_content_has_header() {
        let source = "fn hello() { println!(\"hi\"); }";
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("src/main.rs"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            chunk.enriched_content.starts_with("//"),
            "enriched_content should start with '//' header, got: {}",
            &chunk.enriched_content[..chunk.enriched_content.len().min(80)]
        );
        assert!(
            chunk.enriched_content.contains("src/main.rs"),
            "enriched_content should contain file path"
        );
        assert!(
            !chunk.content.starts_with("//"),
            "raw content should not start with header"
        );
    }

    #[test]
    fn sliding_window_enriched_equals_content() {
        let source = "let x = 42;\nconsole.log(x);\n";
        let chunks = chunk_text(Path::new("test.txt"), source, &ChunkConfig::default());
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert_eq!(
                chunk.enriched_content, chunk.content,
                "sliding window chunks should have enriched_content == content"
            );
        }
    }

    #[test]
    fn header_dropped_when_exceeding_max_bytes() {
        // The definition fits the limit on its own, but not with the header.
        let tiny_config = ChunkConfig {
            max_chunk_bytes: 60,
            window_size: 30,
            window_overlap: 10,
        };
        let source = "fn f() { let x = 42; return x; }";
        assert!(source.len() <= tiny_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("long/path/to/file.rs"),
            source,
            &config,
            &tiny_config,
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            !chunk.enriched_content.starts_with("//"),
            "header should be dropped when it would exceed max_chunk_bytes"
        );
        assert_eq!(chunk.content, source, "raw content should be unchanged");
    }

    #[test]
    fn minify_whitespace_normalizes_indent_and_strips_trailing() {
        // The fixture really uses an 8-space indent so the halving is
        // exercised, as the assertion messages state.
        let source = "fn foo() {\n        let x = 1;\n        let y = 2;\n}\n";
        let result = minify_whitespace(source);
        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(
            lines[1], "    let x = 1;",
            "8-space indent should become 4-space"
        );
        assert_eq!(
            lines[2], "    let y = 2;",
            "8-space indent should become 4-space"
        );

        let with_trailing = "fn bar()   \n    return 1;  \n";
        let result2 = minify_whitespace(with_trailing);
        assert!(
            result2.lines().all(|l| !l.ends_with(' ')),
            "trailing whitespace should be stripped"
        );

        let with_blanks = "a\n\n\n\nb\n";
        let result3 = minify_whitespace(with_blanks);
        // Lengths of every run of empty lines that is followed by text.
        let blank_runs: Vec<usize> = {
            let mut runs = Vec::new();
            let mut count = 0usize;
            for line in result3.lines() {
                if line.is_empty() {
                    count += 1;
                } else {
                    if count > 0 {
                        runs.push(count);
                    }
                    count = 0;
                }
            }
            runs
        };
        assert!(
            blank_runs.iter().all(|&n| n <= 1),
            "3+ blank lines should collapse to 1, got runs: {blank_runs:?}"
        );
    }
}