1use std::path::Path;
9use streaming_iterator::StreamingIterator;
10use tree_sitter::{Parser, QueryCursor};
11
/// Size limits that drive how source text is cut into chunks.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Largest size, in bytes, a piece of text may have and still be emitted
    /// as a single chunk; anything larger is split into sliding windows.
    pub max_chunk_bytes: usize,
    /// Target size, in bytes, of one sliding window.
    pub window_size: usize,
    /// Number of bytes subtracted from `window_size` to obtain the distance
    /// between the starts of consecutive windows.
    pub window_overlap: usize,
}

impl Default for ChunkConfig {
    /// Defaults: 4096-byte chunks, 2048-byte windows, 512 bytes of overlap.
    fn default() -> Self {
        ChunkConfig {
            window_overlap: 512,
            window_size: 2048,
            max_chunk_bytes: 4096,
        }
    }
}
41
/// One unit of source text produced by the chunker, together with its
/// location and a second, "enriched" rendering of the same text.
#[derive(Debug, Clone, rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)]
pub struct CodeChunk {
    /// Path of the originating file, rendered with `Path::display`.
    pub file_path: String,
    /// Captured definition name, the file name for whole-file chunks, or
    /// `"<prefix>[<index>]"` for sliding-window chunks.
    pub name: String,
    /// Tree-sitter node kind of the definition, or `"file"` / `"window"` for
    /// fallback chunks.
    pub kind: String,
    /// 1-based line on which the chunk starts.
    pub start_line: usize,
    /// 1-based line on which the chunk ends (inclusive).
    pub end_line: usize,
    /// Raw text of the chunk, exactly as sliced from the source.
    pub content: String,
    /// For definition chunks: whitespace-minified text, prefixed with a
    /// context header when it fits. For fallback chunks: a copy of `content`.
    pub enriched_content: String,
}
61
62#[must_use]
69pub fn build_scope_chain(node: tree_sitter::Node<'_>, source: &str) -> String {
70 const CONTAINER_KINDS: &[&str] = &[
72 "impl_item",
74 "trait_item",
75 "mod_item",
76 "class_definition",
78 "module",
79 "class_declaration",
81 "type_declaration",
85 "namespace_definition",
87 "class_specifier",
88 ];
89
90 const NAME_FIELDS: &[&str] = &["name", "type"];
94
95 let mut parts = Vec::new();
96 let mut current = node.parent();
97 while let Some(parent) = current {
98 let kind = parent.kind();
99 if CONTAINER_KINDS.contains(&kind) {
100 let name = NAME_FIELDS
101 .iter()
102 .find_map(|field| parent.child_by_field_name(field))
103 .map_or(kind, |n| &source[n.start_byte()..n.end_byte()]);
104 parts.push(format!("{kind} {name}"));
105 }
106 current = parent.parent();
107 }
108 parts.reverse();
109 parts.join(" > ")
110}
111
112#[must_use]
118pub fn extract_signature(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
119 let name_node = node.child_by_field_name("name")?;
120 let body_node = node
121 .child_by_field_name("body")
122 .or_else(|| node.child_by_field_name("block"))?;
123 let start = name_node.start_byte();
124 let end = body_node.start_byte();
125 if start >= end {
126 return None;
127 }
128 let sig = source[start..end].trim();
129 if sig.is_empty() {
130 None
131 } else {
132 Some(sig.to_string())
133 }
134}
135
/// Compacts the whitespace of `source` line by line.
///
/// * Leading indentation is measured with a space counting as 1 and a tab as
///   2, then halved (rounded up) and re-emitted as spaces.
/// * Trailing whitespace is removed from every line.
/// * A run of blank (whitespace-only) lines collapses to one empty line.
/// * The result ends with a newline only if `source` did.
#[must_use]
pub fn minify_whitespace(source: &str) -> String {
    let mut out = String::with_capacity(source.len());
    let mut previous_was_blank = false;

    for raw in source.lines() {
        let text = raw.trim();

        if text.is_empty() {
            // Only the first line of a blank run is kept.
            if !previous_was_blank {
                out.push('\n');
            }
            previous_was_blank = true;
            continue;
        }
        previous_was_blank = false;

        // Indentation width in "columns": space = 1, tab = 2.
        let width: usize = raw
            .chars()
            .map_while(|c| match c {
                ' ' => Some(1usize),
                '\t' => Some(2usize),
                _ => None,
            })
            .sum();

        out.extend(std::iter::repeat(' ').take(width.div_ceil(2)));
        out.push_str(text);
        out.push('\n');
    }

    // Every emitted line got a '\n'; drop the last one if the input had none.
    if out.ends_with('\n') && !source.ends_with('\n') {
        out.pop();
    }

    out
}
186
187fn build_enriched_content(
192 path: &Path,
193 node: tree_sitter::Node<'_>,
194 source: &str,
195 content: &str,
196 max_bytes: usize,
197) -> String {
198 let scope = build_scope_chain(node, source);
199 let sig = extract_signature(node, source).unwrap_or_default();
200 let rel_path = path.display().to_string();
201
202 let header = if scope.is_empty() && sig.is_empty() {
203 format!("// {rel_path}\n")
204 } else if scope.is_empty() {
205 format!("// {rel_path} | defines: {sig}\n")
206 } else if sig.is_empty() {
207 format!("// {rel_path} | {scope}\n")
208 } else {
209 format!("// {rel_path} | {scope} | defines: {sig}\n")
210 };
211
212 let minified = minify_whitespace(content);
215
216 if header.len() + minified.len() > max_bytes {
217 minified
218 } else {
219 format!("{header}{minified}")
220 }
221}
222
223#[must_use]
232pub fn chunk_file(
233 path: &Path,
234 source: &str,
235 config: &crate::languages::LangConfig,
236 chunk_config: &ChunkConfig,
237) -> Vec<CodeChunk> {
238 let mut parser = Parser::new();
239 if parser.set_language(&config.language).is_err() {
240 return sliding_windows(path, source, chunk_config);
241 }
242
243 let Some(tree) = parser.parse(source, None) else {
244 return sliding_windows(path, source, chunk_config);
245 };
246
247 let mut cursor = QueryCursor::new();
248 let mut chunks = Vec::new();
249 let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
250
251 while let Some(m) = matches.next() {
252 let mut name = String::new();
253 let mut def_node = None;
254 for cap in m.captures {
255 let cap_name = &config.query.capture_names()[cap.index as usize];
256 if *cap_name == "name" {
257 name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
258 } else if *cap_name == "def" {
259 def_node = Some(cap.node);
260 }
261 }
262 if let Some(node) = def_node {
263 let content = &source[node.start_byte()..node.end_byte()];
264 let start_line = node.start_position().row + 1;
265
266 if content.len() > chunk_config.max_chunk_bytes {
268 chunks.extend(sliding_windows_with_name(
269 path,
270 content,
271 &name,
272 start_line,
273 chunk_config,
274 ));
275 } else {
276 let enriched = build_enriched_content(
277 path,
278 node,
279 source,
280 content,
281 chunk_config.max_chunk_bytes,
282 );
283 chunks.push(CodeChunk {
284 file_path: path.display().to_string(),
285 name,
286 kind: node.kind().to_string(),
287 start_line,
288 end_line: node.end_position().row + 1,
289 enriched_content: enriched,
290 content: content.to_string(),
291 });
292 }
293 }
294 }
295
296 if chunks.is_empty() && !source.trim().is_empty() {
298 return sliding_windows(path, source, chunk_config);
299 }
300
301 chunks
302}
303
/// Chunks `source` without any syntax awareness.
///
/// Delegates directly to `sliding_windows`, so the result is empty for blank
/// input, a single `"file"` chunk when the text fits in
/// `chunk_config.max_chunk_bytes`, and `"window"` chunks otherwise.
#[must_use]
pub fn chunk_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
    sliding_windows(path, source, chunk_config)
}
317
318fn sliding_windows(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
320 if source.trim().is_empty() {
321 return vec![];
322 }
323
324 if source.len() <= chunk_config.max_chunk_bytes {
326 let content = source.to_string();
327 return vec![CodeChunk {
328 file_path: path.display().to_string(),
329 name: path
330 .file_name()
331 .unwrap_or_default()
332 .to_string_lossy()
333 .to_string(),
334 kind: "file".to_string(),
335 start_line: 1,
336 end_line: source.lines().count(),
337 enriched_content: content.clone(),
338 content,
339 }];
340 }
341
342 let file_name = path
343 .file_name()
344 .unwrap_or_default()
345 .to_string_lossy()
346 .to_string();
347 sliding_window_chunks(source, path, &file_name, 1, chunk_config)
348}
349
/// Splits the text of a single oversized definition into sliding windows.
///
/// Thin adapter over `sliding_window_chunks`: `name` becomes the window name
/// prefix and `base_line` is the 1-based line of the file on which `content`
/// starts, so window line numbers are relative to the file rather than to
/// `content`.
fn sliding_windows_with_name(
    path: &Path,
    content: &str,
    name: &str,
    base_line: usize,
    chunk_config: &ChunkConfig,
) -> Vec<CodeChunk> {
    sliding_window_chunks(content, path, name, base_line, chunk_config)
}
363
364fn sliding_window_chunks(
371 source: &str,
372 file_path: &Path,
373 name_prefix: &str,
374 base_line: usize,
375 chunk_config: &ChunkConfig,
376) -> Vec<CodeChunk> {
377 let step = chunk_config
378 .window_size
379 .saturating_sub(chunk_config.window_overlap)
380 .max(1);
381 let bytes = source.as_bytes();
382 let mut chunks = Vec::new();
383 let mut offset = 0;
384 let mut window_idx = 0;
385
386 while offset < bytes.len() {
387 let raw_end = (offset + chunk_config.window_size).min(bytes.len());
388
389 let end = if raw_end < bytes.len() {
391 match bytes[offset..raw_end].iter().rposition(|&b| b == b'\n') {
392 Some(pos) => offset + pos + 1,
393 None => raw_end, }
395 } else {
396 raw_end
397 };
398
399 if let Ok(window) = std::str::from_utf8(&bytes[offset..end])
401 && !window.trim().is_empty()
402 {
403 let start_line = base_line + source[..offset].matches('\n').count();
404 let content_lines = window.lines().count().max(1);
405 let end_line = start_line + content_lines - 1;
406 let content = window.to_string();
407 chunks.push(CodeChunk {
408 file_path: file_path.display().to_string(),
409 name: format!("{name_prefix}[{window_idx}]"),
410 kind: "window".to_string(),
411 start_line,
412 end_line,
413 enriched_content: content.clone(),
414 content,
415 });
416 window_idx += 1;
417 }
418
419 offset += step;
420 }
421
422 chunks
423}
424
// Unit tests for the chunker: definition-based chunking, the sliding-window
// fallback, scope-chain and signature extraction, enriched-content headers
// and whitespace minification.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fmt::Write as _;
    use std::path::Path;

    // Rust source with two functions and a struct should yield chunks named
    // after the functions.
    #[test]
    fn chunks_rust_functions_and_structs() {
        let source = "fn hello() { println!(\"hi\"); }\nfn world() {}\nstruct Foo { x: i32 }";
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("test.rs"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {}",
            chunks.len()
        );
        assert!(chunks.iter().any(|c| c.name == "hello"));
        assert!(chunks.iter().any(|c| c.name == "world"));
    }

    // Python source with a function and a class should yield a chunk for each.
    #[test]
    fn chunks_python_functions_and_classes() {
        let source = "def greet(name):\n pass\n\nclass Foo:\n pass\n";
        let config = crate::languages::config_for_extension("py").unwrap();
        let chunks = chunk_file(
            Path::new("test.py"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.name == "greet"));
        assert!(chunks.iter().any(|c| c.name == "Foo"));
    }

    // A small file for which the query is expected to match no definitions
    // falls back to one whole-file chunk.
    #[test]
    fn fallback_small_file_single_chunk() {
        let source = "let x = 42;\nconsole.log(x);\n";
        let config = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(
            Path::new("script.js"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].kind, "file");
    }

    // A file larger than max_chunk_bytes with no expected definitions falls
    // back to several "window" chunks.
    #[test]
    fn fallback_large_file_produces_windows() {
        let line = "console.log('hello world, this is a long line of javascript code');\n";
        let source: String = line.repeat(200); let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(Path::new("big.js"), &source, &config, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected multiple windows, got {}",
            chunks.len()
        );
        assert!(chunks.iter().all(|c| c.kind == "window"));
        assert!(chunks[0].name.contains("[0]"));
    }

    // A single definition larger than max_chunk_bytes is split into windows
    // named after the definition.
    #[test]
    fn large_definition_is_windowed() {
        let mut source = String::from("fn big_function() {\n");
        for i in 0..200 {
            writeln!(source, " let var_{i} = {i} * 2 + 1; // some computation").unwrap();
        }
        source.push_str("}\n");
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("test.rs"), &source, &config, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected windowed chunks, got {}",
            chunks.len()
        );
        assert!(chunks[0].name.starts_with("big_function["));
    }

    // Empty input yields no chunks at all.
    #[test]
    fn empty_file_produces_no_chunks() {
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("empty.rs"), "", &config, &ChunkConfig::default());
        assert!(chunks.is_empty());
    }

    // Test helper: parses `source` with the language registered for `ext`
    // and returns the syntax tree together with that language's config.
    // Despite the name it does not locate a node; callers run the query
    // themselves.
    fn first_def_node(
        source: &str,
        ext: &str,
    ) -> (
        tree_sitter::Tree,
        std::sync::Arc<crate::languages::LangConfig>,
    ) {
        let config = crate::languages::config_for_extension(ext).unwrap();
        let mut parser = Parser::new();
        parser.set_language(&config.language).unwrap();
        let tree = parser.parse(source, None).unwrap();
        (tree, config)
    }

    // The scope chain of a method inside `impl Foo` mentions the impl and
    // the type name.
    #[test]
    fn scope_chain_rust_impl_method() {
        let source = "impl Foo {\n fn bar(&self) {}\n}";
        let (tree, config) = first_def_node(source, "rs");
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        // Keeps the last `def` capture produced by the query.
        let mut def_node = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" {
                    def_node = Some(cap.node);
                }
            }
        }
        let node = def_node.expect("should find a @def node");
        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("impl_item"),
            "scope should contain impl_item, got: {scope}"
        );
        assert!(
            scope.contains("Foo"),
            "scope should contain 'Foo', got: {scope}"
        );
    }

    // The scope chain of a Python method mentions the enclosing class.
    #[test]
    fn scope_chain_python_class_method() {
        let source = "class Greeter:\n def say_hello(self):\n pass\n";
        let (tree, config) = first_def_node(source, "py");
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        // Only `def` captures that are function definitions are considered,
        // so the class itself is ignored.
        let mut fn_node = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" && cap.node.kind() == "function_definition" {
                    fn_node = Some(cap.node);
                }
            }
        }
        let node = fn_node.expect("should find say_hello @def node");
        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("class_definition"),
            "scope should contain class_definition, got: {scope}"
        );
        assert!(
            scope.contains("Greeter"),
            "scope should contain 'Greeter', got: {scope}"
        );
    }

    // The extracted signature of a Rust function includes its name,
    // parameters and return type.
    #[test]
    fn extract_signature_rust_function() {
        let source = "fn greet(name: &str) -> String { name.to_string() }";
        let (tree, config) = first_def_node(source, "rs");
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        let mut def_node = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" {
                    def_node = Some(cap.node);
                }
            }
        }
        let node = def_node.expect("should find @def node");
        let sig = extract_signature(node, source).expect("should extract signature");
        assert!(
            sig.contains("greet"),
            "signature should contain 'greet', got: {sig}"
        );
        assert!(
            sig.contains("name: &str"),
            "signature should contain parameter, got: {sig}"
        );
        assert!(
            sig.contains("-> String"),
            "signature should contain return type, got: {sig}"
        );
    }

    // Definition chunks carry a `//` header with the file path in
    // `enriched_content`, while `content` stays raw.
    #[test]
    fn enriched_content_has_header() {
        let source = "fn hello() { println!(\"hi\"); }";
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("src/main.rs"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            chunk.enriched_content.starts_with("//"),
            "enriched_content should start with '//' header, got: {}",
            &chunk.enriched_content[..chunk.enriched_content.len().min(80)]
        );
        assert!(
            chunk.enriched_content.contains("src/main.rs"),
            "enriched_content should contain file path"
        );
        assert!(
            !chunk.content.starts_with("//"),
            "raw content should not start with header"
        );
    }

    // Fallback chunks get no header: enriched_content is a copy of content.
    #[test]
    fn sliding_window_enriched_equals_content() {
        let source = "let x = 42;\nconsole.log(x);\n";
        let chunks = chunk_text(Path::new("test.txt"), source, &ChunkConfig::default());
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert_eq!(
                chunk.enriched_content, chunk.content,
                "sliding window chunks should have enriched_content == content"
            );
        }
    }

    // With a tiny byte budget the definition still fits on its own, but the
    // header is expected to be omitted because header + body would exceed
    // max_chunk_bytes.
    #[test]
    fn header_dropped_when_exceeding_max_bytes() {
        let tiny_config = ChunkConfig {
            max_chunk_bytes: 60,
            window_size: 30,
            window_overlap: 10,
        };
        let source = "fn f() { let x = 42; return x; }";
        assert!(source.len() <= tiny_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("long/path/to/file.rs"),
            source,
            &config,
            &tiny_config,
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            !chunk.enriched_content.starts_with("//"),
            "header should be dropped when it would exceed max_chunk_bytes"
        );
        assert_eq!(chunk.content, source, "raw content should be unchanged");
    }

    // Covers the three minification rules: indentation is reduced, trailing
    // whitespace is stripped, and runs of blank lines collapse to one.
    #[test]
    fn minify_whitespace_normalizes_indent_and_strips_trailing() {
        let source = "fn foo() {\n let x = 1;\n let y = 2;\n}\n";
        let result = minify_whitespace(source);
        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(
            lines[1], " let x = 1;",
            "8-space indent should become 4-space"
        );
        assert_eq!(
            lines[2], " let y = 2;",
            "8-space indent should become 4-space"
        );

        let with_trailing = "fn bar() \n return 1; \n";
        let result2 = minify_whitespace(with_trailing);
        assert!(
            result2.lines().all(|l| !l.ends_with(' ')),
            "trailing whitespace should be stripped"
        );

        let with_blanks = "a\n\n\n\nb\n";
        let result3 = minify_whitespace(with_blanks);
        // Lengths of each run of consecutive empty lines that is followed by
        // a non-empty line.
        let blank_runs: Vec<usize> = {
            let mut runs = Vec::new();
            let mut count = 0usize;
            for line in result3.lines() {
                if line.is_empty() {
                    count += 1;
                } else {
                    if count > 0 {
                        runs.push(count);
                    }
                    count = 0;
                }
            }
            runs
        };
        assert!(
            blank_runs.iter().all(|&n| n <= 1),
            "3+ blank lines should collapse to 1, got runs: {blank_runs:?}"
        );
    }
}