1use std::path::Path;
9use streaming_iterator::StreamingIterator;
10use tree_sitter::{Parser, QueryCursor};
11
12#[derive(
25 Debug,
26 Clone,
27 Copy,
28 Default,
29 PartialEq,
30 Eq,
31 Hash,
32 rkyv::Archive,
33 rkyv::Serialize,
34 rkyv::Deserialize,
35 bitcode::Encode,
36 bitcode::Decode,
37)]
38pub enum ContentKind {
39 #[default]
44 Code,
45 Docs,
47 Meta,
50}
51
52impl ContentKind {
53 #[must_use]
60 pub fn from_extension(ext: &str) -> Self {
61 match ext.to_ascii_lowercase().as_str() {
62 "md" | "rst" | "txt" | "adoc" | "asciidoc" | "org" => Self::Docs,
64 "json" | "yaml" | "yml" | "toml" | "xml" | "lock" | "snap" | "csv" | "tsv"
70 | "proto" | "rdf" | "owl" | "tfvars" => Self::Meta,
71 _ => Self::Code,
73 }
74 }
75}
76
/// Size limits that control how text is split into chunks.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Largest size, in bytes, that a definition or whole file may have and
    /// still be emitted as one chunk; anything larger is cut into windows.
    pub max_chunk_bytes: usize,
    /// Upper bound, in bytes, on the length of one sliding window.
    pub window_size: usize,
    /// Bytes shared by consecutive windows: the window start advances by
    /// `window_size - window_overlap` (never less than one byte).
    pub window_overlap: usize,
}

impl Default for ChunkConfig {
    /// 4 KiB chunks, windowed at 2 KiB with a 512-byte overlap.
    fn default() -> Self {
        const KIB: usize = 1024;
        Self {
            max_chunk_bytes: 4 * KIB,
            window_size: 2 * KIB,
            window_overlap: KIB / 2,
        }
    }
}
106
107#[derive(
109 Debug,
110 Clone,
111 rkyv::Archive,
112 rkyv::Serialize,
113 rkyv::Deserialize,
114 bitcode::Encode,
115 bitcode::Decode,
116)]
117pub struct CodeChunk {
118 pub file_path: String,
120 pub name: String,
122 pub kind: String,
129 pub content_kind: ContentKind,
135 pub start_line: usize,
138 pub end_line: usize,
140 pub symbol_line: usize,
153 pub content: String,
155 pub enriched_content: String,
158 pub qualified_name: Option<String>,
172}
173
174#[must_use]
181pub fn build_scope_chain(node: tree_sitter::Node<'_>, source: &str) -> String {
182 const CONTAINER_KINDS: &[&str] = &[
184 "impl_item",
186 "trait_item",
187 "mod_item",
188 "class_definition",
190 "module",
191 "class_declaration",
193 "type_declaration",
197 "namespace_definition",
199 "class_specifier",
200 ];
201
202 const NAME_FIELDS: &[&str] = &["name", "type"];
206
207 let mut parts = Vec::new();
208 let mut current = node.parent();
209 while let Some(parent) = current {
210 let kind = parent.kind();
211 if CONTAINER_KINDS.contains(&kind) {
212 let name = NAME_FIELDS
213 .iter()
214 .find_map(|field| parent.child_by_field_name(field))
215 .map_or(kind, |n| &source[n.start_byte()..n.end_byte()]);
216 parts.push(format!("{kind} {name}"));
217 }
218 current = parent.parent();
219 }
220 parts.reverse();
221 parts.join(" > ")
222}
223
224#[must_use]
230pub fn extract_signature(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
231 let name_node = node.child_by_field_name("name")?;
232 let body_node = node
233 .child_by_field_name("body")
234 .or_else(|| node.child_by_field_name("block"))?;
235 let start = name_node.start_byte();
236 let end = body_node.start_byte();
237 if start >= end {
238 return None;
239 }
240 let sig = source[start..end].trim();
241 if sig.is_empty() {
242 None
243 } else {
244 Some(sig.to_string())
245 }
246}
247
/// Compacts the whitespace of `source` without changing its line structure.
///
/// * Leading indentation is halved (rounded up): a space counts one column,
///   a tab two, and the result is emitted as spaces.
/// * Trailing whitespace is removed from every line.
/// * A run of blank (whitespace-only) lines collapses to a single empty line.
/// * A final newline is present in the result only if `source` had one.
#[must_use]
pub fn minify_whitespace(source: &str) -> String {
    let mut out = String::with_capacity(source.len());
    // True while the previous line was blank, so a run yields one newline.
    let mut in_blank_run = false;

    for raw in source.lines() {
        let body = raw.trim_start();
        if body.is_empty() {
            if !in_blank_run {
                out.push('\n');
            }
            in_blank_run = true;
            continue;
        }
        in_blank_run = false;

        // Width of the leading run of spaces/tabs, in columns.
        let width: usize = raw
            .chars()
            .map_while(|c| match c {
                ' ' => Some(1usize),
                '\t' => Some(2),
                _ => None,
            })
            .sum();

        out.extend(std::iter::repeat(' ').take(width.div_ceil(2)));
        out.push_str(body.trim_end());
        out.push('\n');
    }

    // `lines()` hides whether the input ended with a newline; mirror it.
    if out.ends_with('\n') && !source.ends_with('\n') {
        out.pop();
    }
    out
}
298
299fn build_enriched_content(
304 path: &Path,
305 node: tree_sitter::Node<'_>,
306 source: &str,
307 content: &str,
308 max_bytes: usize,
309) -> String {
310 let scope = build_scope_chain(node, source);
311 let sig = extract_signature(node, source).unwrap_or_default();
312 let rel_path = path.display().to_string();
313
314 let header = if scope.is_empty() && sig.is_empty() {
315 format!("// {rel_path}\n")
316 } else if scope.is_empty() {
317 format!("// {rel_path} | defines: {sig}\n")
318 } else if sig.is_empty() {
319 format!("// {rel_path} | {scope}\n")
320 } else {
321 format!("// {rel_path} | {scope} | defines: {sig}\n")
322 };
323
324 let minified = minify_whitespace(content);
327
328 if header.len() + minified.len() > max_bytes {
329 minified
330 } else {
331 format!("{header}{minified}")
332 }
333}
334
335#[must_use]
344pub fn chunk_file(
345 path: &Path,
346 source: &str,
347 config: &crate::languages::LangConfig,
348 chunk_config: &ChunkConfig,
349) -> Vec<CodeChunk> {
350 let mut parser = Parser::new();
351 if parser.set_language(&config.language).is_err() {
352 return sliding_windows(path, source, chunk_config);
353 }
354
355 let Some(tree) = parser.parse(source, None) else {
356 return sliding_windows(path, source, chunk_config);
357 };
358
359 let mut cursor = QueryCursor::new();
360 let mut chunks = Vec::new();
361 let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
362
363 while let Some(m) = matches.next() {
364 let mut name = String::new();
365 let mut name_line: Option<usize> = None;
368 let mut def_node = None;
369 for cap in m.captures {
370 let cap_name = &config.query.capture_names()[cap.index as usize];
371 if *cap_name == "name" {
372 name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
373 name_line = Some(cap.node.start_position().row + 1);
375 } else if *cap_name == "def" {
376 def_node = Some(cap.node);
377 }
378 }
379 if let Some(node) = def_node {
380 let content = &source[node.start_byte()..node.end_byte()];
381 let start_line = node.start_position().row + 1;
382 let symbol_line = name_line.unwrap_or(start_line);
386
387 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
391 let is_hcl = matches!(ext.to_ascii_lowercase().as_str(), "tf" | "tfvars" | "hcl");
392
393 if is_hcl && node.kind() == "attribute" {
401 continue;
402 }
403
404 let qualified_name = if is_hcl && node.kind() == "block" {
405 let composite = crate::languages::derive_hcl_block_name(&node, source.as_bytes());
406 if composite.is_empty() || composite == name {
407 None
408 } else {
409 Some(composite)
410 }
411 } else {
412 None
413 };
414
415 if is_hcl && node.kind() == "block" && name == "locals" {
420 chunks.extend(emit_hcl_local_attribute_chunks(
421 path,
422 source,
423 node,
424 chunk_config,
425 ));
426 }
427
428 if content.len() > chunk_config.max_chunk_bytes {
430 chunks.extend(sliding_windows_with_name(
431 path,
432 content,
433 &name,
434 start_line,
435 chunk_config,
436 ));
437 } else {
438 let enriched = build_enriched_content(
439 path,
440 node,
441 source,
442 content,
443 chunk_config.max_chunk_bytes,
444 );
445 chunks.push(CodeChunk {
446 file_path: path.display().to_string(),
447 name,
448 kind: node.kind().to_string(),
449 content_kind: ContentKind::from_extension(ext),
450 start_line,
451 end_line: node.end_position().row + 1,
452 symbol_line,
453 enriched_content: enriched,
454 content: content.to_string(),
455 qualified_name,
456 });
457 }
458 }
459 }
460
461 if chunks.is_empty() && !source.trim().is_empty() {
463 return sliding_windows(path, source, chunk_config);
464 }
465
466 chunks
467}
468
469fn emit_hcl_local_attribute_chunks(
483 path: &Path,
484 source: &str,
485 locals_block: tree_sitter::Node<'_>,
486 chunk_config: &ChunkConfig,
487) -> Vec<CodeChunk> {
488 let mut out = Vec::new();
489 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
490 let content_kind = ContentKind::from_extension(ext);
491
492 let mut cursor = locals_block.walk();
495 for child in locals_block.children(&mut cursor) {
496 if child.kind() != "body" {
497 continue;
498 }
499 let mut body_cursor = child.walk();
500 for attr in child.children(&mut body_cursor) {
501 if attr.kind() != "attribute" {
502 continue;
503 }
504 let mut name_node: Option<tree_sitter::Node<'_>> = None;
506 let mut attr_cursor = attr.walk();
507 for grandchild in attr.children(&mut attr_cursor) {
508 if grandchild.kind() == "identifier" {
509 name_node = Some(grandchild);
510 break;
511 }
512 }
513 let Some(id_node) = name_node else {
514 continue;
515 };
516 let name_text = source[id_node.start_byte()..id_node.end_byte()].to_string();
517 let content_text = source[attr.start_byte()..attr.end_byte()].to_string();
518 let start_line = attr.start_position().row + 1;
519 let end_line = attr.end_position().row + 1;
520 let symbol_line = id_node.start_position().row + 1;
521 let composite = format!("local.{name_text}");
522 let header = format!("// {} | local: {composite}\n", path.display());
523 let enriched = if header.len() + content_text.len() <= chunk_config.max_chunk_bytes {
524 format!("{header}{content_text}")
525 } else {
526 content_text.clone()
527 };
528 out.push(CodeChunk {
529 file_path: path.display().to_string(),
530 name: name_text,
531 kind: "local_attribute".to_string(),
532 content_kind,
533 start_line,
534 end_line,
535 symbol_line,
536 enriched_content: enriched,
537 content: content_text,
538 qualified_name: Some(composite),
539 });
540 }
541 }
542 out
543}
544
545#[must_use]
555pub fn chunk_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
556 sliding_windows(path, source, chunk_config)
557}
558
/// Reports whether `ext` (without the leading dot) is one of the line-based
/// RDF text extensions handled by `chunk_rdf_text`: `ttl`, `nt`, `n3`,
/// `trig` or `nq`. The comparison is ASCII case-insensitive.
#[must_use]
pub fn is_rdf_text_extension(ext: &str) -> bool {
    const RDF_TEXT_EXTENSIONS: [&str; 5] = ["ttl", "nt", "n3", "trig", "nq"];
    RDF_TEXT_EXTENSIONS
        .iter()
        .any(|known| ext.eq_ignore_ascii_case(known))
}
567
568#[must_use]
575pub fn chunk_rdf_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
576 if source.trim().is_empty() {
577 return vec![];
578 }
579
580 let mut chunks = Vec::new();
581 let mut current = String::new();
582 let mut current_start_line = 1usize;
583 let mut current_is_directive = false;
584
585 for (line_idx, line) in source.lines().enumerate() {
586 let line_no = line_idx + 1;
587 let trimmed = line.trim();
588 if trimmed.is_empty() {
589 flush_rdf_block(
590 path,
591 ¤t,
592 current_start_line,
593 chunk_config,
594 &mut chunks,
595 );
596 current.clear();
597 current_is_directive = false;
598 continue;
599 }
600
601 let line_is_directive = is_rdf_directive(trimmed);
602 if !current.is_empty() && current_is_directive && !line_is_directive {
603 flush_rdf_block(
604 path,
605 ¤t,
606 current_start_line,
607 chunk_config,
608 &mut chunks,
609 );
610 current.clear();
611 current_is_directive = false;
612 }
613
614 if current.is_empty() {
615 current_start_line = line_no;
616 current_is_directive = line_is_directive;
617 }
618 current.push_str(line);
619 current.push('\n');
620
621 if !current_is_directive && trimmed.ends_with('.') {
622 flush_rdf_block(
623 path,
624 ¤t,
625 current_start_line,
626 chunk_config,
627 &mut chunks,
628 );
629 current.clear();
630 current_is_directive = false;
631 }
632 }
633
634 flush_rdf_block(
635 path,
636 ¤t,
637 current_start_line,
638 chunk_config,
639 &mut chunks,
640 );
641 if chunks.is_empty() {
642 sliding_windows(path, source, chunk_config)
643 } else {
644 chunks
645 }
646}
647
648#[must_use]
650pub fn chunk_source_for_path(
651 path: &Path,
652 source: &str,
653 text_mode: bool,
654 chunk_config: &ChunkConfig,
655) -> Vec<CodeChunk> {
656 if text_mode {
657 return chunk_text(path, source, chunk_config);
658 }
659
660 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
661 if let Some(lang_config) = crate::languages::config_for_extension(ext) {
662 chunk_file(path, source, &lang_config, chunk_config)
663 } else if is_rdf_text_extension(ext) {
664 chunk_rdf_text(path, source, chunk_config)
665 } else {
666 chunk_text(path, source, chunk_config)
667 }
668}
669
/// Reports whether an already-trimmed line opens a Turtle directive.
///
/// Recognised forms are the Turtle keywords `@prefix` / `@base`
/// (case-sensitive) and the SPARQL-style keywords `PREFIX` / `BASE`, which
/// Turtle treats as case-insensitive. The SPARQL-style keywords must stand
/// as a whole token, so a prefixed name such as `BASE:thing` is not mistaken
/// for a directive.
fn is_rdf_directive(trimmed: &str) -> bool {
    /// True when `line` starts with `keyword` (ASCII case-insensitive) and
    /// the keyword is followed by end of line, whitespace or `<`.
    fn opens_with_keyword(line: &str, keyword: &str) -> bool {
        // `get` also rejects lines shorter than the keyword or lines where
        // the cut would fall inside a multi-byte character.
        let Some(head) = line.get(..keyword.len()) else {
            return false;
        };
        head.eq_ignore_ascii_case(keyword)
            && line[keyword.len()..]
                .chars()
                .next()
                .map_or(true, |next| next.is_whitespace() || next == '<')
    }

    trimmed.starts_with("@prefix")
        || trimmed.starts_with("@base")
        || opens_with_keyword(trimmed, "PREFIX")
        || opens_with_keyword(trimmed, "BASE")
}
676
677fn flush_rdf_block(
678 path: &Path,
679 content: &str,
680 start_line: usize,
681 chunk_config: &ChunkConfig,
682 chunks: &mut Vec<CodeChunk>,
683) {
684 let trimmed = content.trim();
685 if trimmed.is_empty() {
686 return;
687 }
688 let name = rdf_block_name(trimmed, path);
689 let content = format!("{trimmed}\n");
690 if content.len() > chunk_config.max_chunk_bytes {
691 chunks.extend(sliding_window_chunks(
692 &content,
693 path,
694 &name,
695 start_line,
696 chunk_config,
697 ));
698 return;
699 }
700 let header = format!("# {} | rdf: {name}\n", path.display());
701 let enriched_content = if header.len() + content.len() <= chunk_config.max_chunk_bytes {
702 format!("{header}{content}")
703 } else {
704 content.clone()
705 };
706 let line_count = content.lines().count().max(1);
707 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
708 chunks.push(CodeChunk {
709 file_path: path.display().to_string(),
710 name,
711 kind: "rdf_statements".to_string(),
712 content_kind: ContentKind::from_extension(ext),
713 start_line,
714 end_line: start_line + line_count - 1,
715 symbol_line: start_line,
717 enriched_content,
718 content,
719 qualified_name: None,
720 });
721}
722
/// Derives a display name for an RDF block.
///
/// The first line that is neither blank nor a `#` comment decides:
///
/// * a prefix directive (`@prefix`, or SPARQL-style `PREFIX` in any letter
///   case) is named `"@prefix"`;
/// * a base directive (`@base`, or SPARQL-style `BASE` in any letter case)
///   is named `"@base"`;
/// * otherwise the name is the line's first whitespace-separated token with
///   trailing `;`, `,` and `.` removed.
///
/// SPARQL-style keywords only count when they stand as a whole token, so a
/// subject such as `BASE:thing` keeps its own name. When no usable line or
/// token exists the file name of `path` is returned.
fn rdf_block_name(content: &str, path: &Path) -> String {
    /// True when `line` starts with `keyword` (ASCII case-insensitive) and
    /// the keyword is followed by end of line, whitespace or `<`.
    fn opens_with_keyword(line: &str, keyword: &str) -> bool {
        let Some(head) = line.get(..keyword.len()) else {
            return false;
        };
        head.eq_ignore_ascii_case(keyword)
            && line[keyword.len()..]
                .chars()
                .next()
                .map_or(true, |next| next.is_whitespace() || next == '<')
    }

    // Fallback used whenever the block itself offers no name.
    let file_name = || {
        path.file_name()
            .unwrap_or_default()
            .to_string_lossy()
            .to_string()
    };

    let Some(first) = content
        .lines()
        .map(str::trim)
        .find(|line| !line.is_empty() && !line.starts_with('#'))
    else {
        return file_name();
    };

    if first.starts_with("@prefix") || opens_with_keyword(first, "PREFIX") {
        return "@prefix".to_string();
    }
    if first.starts_with("@base") || opens_with_keyword(first, "BASE") {
        return "@base".to_string();
    }

    let token = first
        .split_whitespace()
        .next()
        .unwrap_or("")
        .trim_end_matches([';', ',', '.']);
    if token.is_empty() {
        file_name()
    } else {
        token.to_string()
    }
}
757
758fn sliding_windows(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
760 if source.trim().is_empty() {
761 return vec![];
762 }
763
764 if source.len() <= chunk_config.max_chunk_bytes {
766 let content = source.to_string();
767 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
768 return vec![CodeChunk {
769 file_path: path.display().to_string(),
770 name: path
771 .file_name()
772 .unwrap_or_default()
773 .to_string_lossy()
774 .to_string(),
775 kind: "file".to_string(),
776 content_kind: ContentKind::from_extension(ext),
777 start_line: 1,
778 end_line: source.lines().count(),
779 symbol_line: 1,
781 enriched_content: content.clone(),
782 content,
783 qualified_name: None,
784 }];
785 }
786
787 let file_name = path
788 .file_name()
789 .unwrap_or_default()
790 .to_string_lossy()
791 .to_string();
792 sliding_window_chunks(source, path, &file_name, 1, chunk_config)
793}
794
795fn sliding_windows_with_name(
800 path: &Path,
801 content: &str,
802 name: &str,
803 base_line: usize,
804 chunk_config: &ChunkConfig,
805) -> Vec<CodeChunk> {
806 sliding_window_chunks(content, path, name, base_line, chunk_config)
807}
808
809fn sliding_window_chunks(
816 source: &str,
817 file_path: &Path,
818 name_prefix: &str,
819 base_line: usize,
820 chunk_config: &ChunkConfig,
821) -> Vec<CodeChunk> {
822 let step = chunk_config
823 .window_size
824 .saturating_sub(chunk_config.window_overlap)
825 .max(1);
826 let bytes = source.as_bytes();
827 let mut chunks = Vec::new();
828 let mut offset = 0;
829 let mut window_idx = 0;
830
831 while offset < bytes.len() {
832 let raw_end = (offset + chunk_config.window_size).min(bytes.len());
833
834 let end = if raw_end < bytes.len() {
836 match bytes[offset..raw_end].iter().rposition(|&b| b == b'\n') {
837 Some(pos) => offset + pos + 1,
838 None => raw_end, }
840 } else {
841 raw_end
842 };
843
844 if let Ok(window) = std::str::from_utf8(&bytes[offset..end])
846 && !window.trim().is_empty()
847 {
848 let start_line = base_line + source[..offset].matches('\n').count();
849 let content_lines = window.lines().count().max(1);
850 let end_line = start_line + content_lines - 1;
851 let content = window.to_string();
852 let ext = file_path.extension().and_then(|e| e.to_str()).unwrap_or("");
853 chunks.push(CodeChunk {
854 file_path: file_path.display().to_string(),
855 name: format!("{name_prefix}[{window_idx}]"),
856 kind: "window".to_string(),
857 content_kind: ContentKind::from_extension(ext),
858 start_line,
859 end_line,
860 symbol_line: start_line,
862 enriched_content: content.clone(),
863 content,
864 qualified_name: None,
865 });
866 window_idx += 1;
867 }
868
869 offset += step;
870 }
871
872 chunks
873}
874
#[cfg(test)]
mod tests {
    use super::*;
    use std::fmt::Write as _;
    use std::path::Path;

    // ---- helpers -------------------------------------------------------

    /// Chunks `source` as the file `path`, using the grammar registered for
    /// the path's extension.
    fn chunk_as(path: &str, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
        let path = Path::new(path);
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .expect("test paths carry an extension");
        let lang = crate::languages::config_for_extension(ext).unwrap();
        chunk_file(path, source, &lang, chunk_config)
    }

    /// [`chunk_as`] with the default [`ChunkConfig`].
    fn chunk_default(path: &str, source: &str) -> Vec<CodeChunk> {
        chunk_as(path, source, &ChunkConfig::default())
    }

    /// True when some chunk is named exactly `name`.
    fn has_name(chunks: &[CodeChunk], name: &str) -> bool {
        chunks.iter().any(|c| c.name == name)
    }

    /// Asserts that `chunks` is non-empty and uniformly of kind `expected`.
    fn assert_content_kind(
        chunks: &[CodeChunk],
        expected: ContentKind,
        empty_msg: &str,
        kind_msg: &str,
    ) {
        assert!(!chunks.is_empty(), "{empty_msg}");
        let kinds: Vec<_> = chunks.iter().map(|c| c.content_kind).collect();
        assert!(
            kinds.iter().all(|kind| *kind == expected),
            "{kind_msg}; got: {kinds:?}"
        );
    }

    /// Parses `source` with the grammar registered for `ext`.
    fn parse_with_grammar(
        source: &str,
        ext: &str,
    ) -> (
        tree_sitter::Tree,
        std::sync::Arc<crate::languages::LangConfig>,
    ) {
        let config = crate::languages::config_for_extension(ext).unwrap();
        let mut parser = Parser::new();
        parser.set_language(&config.language).unwrap();
        let tree = parser.parse(source, None).unwrap();
        (tree, config)
    }

    /// Returns the last node captured as `@def`, optionally restricted to
    /// nodes of kind `wanted_kind`.
    fn last_def_node<'t>(
        tree: &'t tree_sitter::Tree,
        config: &crate::languages::LangConfig,
        source: &str,
        wanted_kind: Option<&str>,
    ) -> Option<tree_sitter::Node<'t>> {
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        let mut found = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                let kind_ok = wanted_kind.map_or(true, |kind| cap.node.kind() == kind);
                if *cap_name == "def" && kind_ok {
                    found = Some(cap.node);
                }
            }
        }
        found
    }

    // ---- tree-sitter chunking -----------------------------------------

    #[test]
    fn chunks_rust_functions_and_structs() {
        let chunks = chunk_default(
            "test.rs",
            "fn hello() { println!(\"hi\"); }\nfn world() {}\nstruct Foo { x: i32 }",
        );
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {}",
            chunks.len()
        );
        assert!(has_name(&chunks, "hello"));
        assert!(has_name(&chunks, "world"));
    }

    #[test]
    fn chunks_python_functions_and_classes() {
        let chunks = chunk_default(
            "test.py",
            "def greet(name):\n    pass\n\nclass Foo:\n    pass\n",
        );
        assert!(chunks.len() >= 2);
        assert!(has_name(&chunks, "greet"));
        assert!(has_name(&chunks, "Foo"));
    }

    #[test]
    fn chunks_python_stub_functions_and_classes() {
        let chunks = chunk_default(
            "test.pyi",
            "from typing import Protocol\n\ndef greet(name: str) -> str: ...\n\nclass Foo(Protocol):\n    value: int\n",
        );
        assert!(chunks.len() >= 2);
        assert!(has_name(&chunks, "greet"));
        assert!(has_name(&chunks, "Foo"));
    }

    // ---- fallbacks -----------------------------------------------------

    #[test]
    fn fallback_small_file_single_chunk() {
        // Comments only: the query matches nothing, so the file is one chunk.
        let chunks = chunk_default("script.js", "// just a comment\n// and another\n");
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].kind, "file");
    }

    #[test]
    fn fallback_large_file_produces_windows() {
        let chunk_config = ChunkConfig::default();
        let source =
            "console.log('hello world, this is a long line of javascript code');\n".repeat(200);
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let chunks = chunk_as("big.js", &source, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected multiple windows, got {}",
            chunks.len()
        );
        assert!(chunks.iter().all(|c| c.kind == "window"));
        assert!(chunks[0].name.contains("[0]"));
    }

    #[test]
    fn large_definition_is_windowed() {
        let chunk_config = ChunkConfig::default();
        let mut source = String::from("fn big_function() {\n");
        for i in 0..200 {
            writeln!(source, "    let var_{i} = {i} * 2 + 1; // some computation").unwrap();
        }
        source.push_str("}\n");
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let chunks = chunk_as("test.rs", &source, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected windowed chunks, got {}",
            chunks.len()
        );
        assert!(chunks[0].name.starts_with("big_function["));
    }

    #[test]
    fn empty_file_produces_no_chunks() {
        assert!(chunk_default("empty.rs", "").is_empty());
    }

    // ---- scope chain and signature -------------------------------------

    #[test]
    fn scope_chain_rust_impl_method() {
        let source = "impl Foo {\n    fn bar(&self) {}\n}";
        let (tree, config) = parse_with_grammar(source, "rs");
        let node = last_def_node(&tree, &config, source, None).expect("should find a @def node");

        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("impl_item"),
            "scope should contain impl_item, got: {scope}"
        );
        assert!(
            scope.contains("Foo"),
            "scope should contain 'Foo', got: {scope}"
        );
    }

    #[test]
    fn scope_chain_python_class_method() {
        let source = "class Greeter:\n    def say_hello(self):\n        pass\n";
        let (tree, config) = parse_with_grammar(source, "py");
        // The class is captured as @def too; pick the method.
        let node = last_def_node(&tree, &config, source, Some("function_definition"))
            .expect("should find say_hello @def node");

        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("class_definition"),
            "scope should contain class_definition, got: {scope}"
        );
        assert!(
            scope.contains("Greeter"),
            "scope should contain 'Greeter', got: {scope}"
        );
    }

    #[test]
    fn extract_signature_rust_function() {
        let source = "fn greet(name: &str) -> String { name.to_string() }";
        let (tree, config) = parse_with_grammar(source, "rs");
        let node = last_def_node(&tree, &config, source, None).expect("should find @def node");

        let sig = extract_signature(node, source).expect("should extract signature");
        assert!(
            sig.contains("greet"),
            "signature should contain 'greet', got: {sig}"
        );
        assert!(
            sig.contains("name: &str"),
            "signature should contain parameter, got: {sig}"
        );
        assert!(
            sig.contains("-> String"),
            "signature should contain return type, got: {sig}"
        );
    }

    // ---- enriched content ----------------------------------------------

    #[test]
    fn enriched_content_has_header() {
        let chunks = chunk_default("src/main.rs", "fn hello() { println!(\"hi\"); }");
        assert!(!chunks.is_empty());

        let chunk = &chunks[0];
        let enriched = &chunk.enriched_content;
        assert!(
            enriched.starts_with("//"),
            "enriched_content should start with '//' header, got: {}",
            &enriched[..enriched.len().min(80)]
        );
        assert!(
            enriched.contains("src/main.rs"),
            "enriched_content should contain file path"
        );
        assert!(
            !chunk.content.starts_with("//"),
            "raw content should not start with header"
        );
    }

    #[test]
    fn sliding_window_enriched_equals_content() {
        let chunks = chunk_text(
            Path::new("test.txt"),
            "let x = 42;\nconsole.log(x);\n",
            &ChunkConfig::default(),
        );
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert_eq!(
                chunk.enriched_content, chunk.content,
                "sliding window chunks should have enriched_content == content"
            );
        }
    }

    #[test]
    fn header_dropped_when_exceeding_max_bytes() {
        // The definition fits in 60 bytes, but not together with its header.
        let tiny_config = ChunkConfig {
            max_chunk_bytes: 60,
            window_size: 30,
            window_overlap: 10,
        };
        let source = "fn f() { let x = 42; return x; }";
        assert!(source.len() <= tiny_config.max_chunk_bytes);

        let chunks = chunk_as("long/path/to/file.rs", source, &tiny_config);
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            !chunk.enriched_content.starts_with("//"),
            "header should be dropped when it would exceed max_chunk_bytes"
        );
        assert_eq!(chunk.content, source, "raw content should be unchanged");
    }

    // ---- RDF -----------------------------------------------------------

    #[test]
    fn chunks_rdf_xml_and_owl_elements_with_tree_sitter() {
        let source = r#"<?xml version="1.0"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:owl="http://www.w3.org/2002/07/owl#">
  <owl:Class rdf:about="http://example.com/Person"/>
  <owl:ObjectProperty rdf:about="http://example.com/knows"/>
</rdf:RDF>"#;

        let rdf_chunks = chunk_default("ontology.rdf", source);
        let owl_chunks = chunk_default("ontology.owl", source);

        assert!(has_name(&rdf_chunks, "owl:Class"));
        assert!(has_name(&rdf_chunks, "owl:ObjectProperty"));
        assert!(rdf_chunks.iter().all(|chunk| chunk.kind == "element"));
        assert!(has_name(&owl_chunks, "owl:Class"));
    }

    #[test]
    fn chunks_turtle_by_rdf_statement_blocks() {
        let source = r#"@prefix ex: <http://example.com/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

ex:Person
    a owl:Class ;
    ex:label "Person" .

ex:knows
    a owl:ObjectProperty ;
    ex:domain ex:Person ;
    ex:range ex:Person .
"#;

        let chunks = chunk_rdf_text(Path::new("ontology.ttl"), source, &ChunkConfig::default());

        let names: Vec<&str> = chunks.iter().map(|c| c.name.as_str()).collect();
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].kind, "rdf_statements");
        assert_eq!(names[0], "@prefix");
        assert_eq!(names[1], "ex:Person");
        assert_eq!(names[2], "ex:knows");
    }

    // ---- whitespace minification ----------------------------------------

    #[test]
    fn minify_whitespace_normalizes_indent_and_strips_trailing() {
        let indented = minify_whitespace("fn foo() {\n        let x = 1;\n        let y = 2;\n}\n");
        let lines: Vec<&str> = indented.lines().collect();
        assert_eq!(
            lines[1], "    let x = 1;",
            "8-space indent should become 4-space"
        );
        assert_eq!(
            lines[2], "    let y = 2;",
            "8-space indent should become 4-space"
        );

        let stripped = minify_whitespace("fn bar()   \n    return 1;   \n");
        assert!(
            stripped.lines().all(|l| !l.ends_with(' ')),
            "trailing whitespace should be stripped"
        );

        let collapsed = minify_whitespace("a\n\n\n\nb\n");
        // Lengths of the blank-line runs that are followed by a non-blank line.
        let mut blank_runs: Vec<usize> = Vec::new();
        let mut pending = 0usize;
        for line in collapsed.lines() {
            if line.is_empty() {
                pending += 1;
                continue;
            }
            if pending > 0 {
                blank_runs.push(pending);
            }
            pending = 0;
        }
        assert!(
            blank_runs.iter().all(|&n| n <= 1),
            "3+ blank lines should collapse to 1, got runs: {blank_runs:?}"
        );
    }

    // ---- content kind ---------------------------------------------------

    #[test]
    fn chunk_content_kind_code_for_rust_file() {
        assert_content_kind(
            &chunk_default("src/lib.rs", "fn hello() {}\n"),
            ContentKind::Code,
            "expected at least one chunk",
            "all chunks from a .rs file must have ContentKind::Code",
        );
    }

    #[test]
    fn chunk_content_kind_meta_for_json_file() {
        assert_content_kind(
            &chunk_default("data.json", r#"{"key": "value", "answer": 42}"#),
            ContentKind::Meta,
            "expected at least one chunk",
            "all chunks from a .json file must have ContentKind::Meta",
        );
    }

    #[test]
    fn chunk_content_kind_docs_for_md_file() {
        assert_content_kind(
            &chunk_default("README.md", "# Title\n\nSome prose content.\n"),
            ContentKind::Docs,
            "expected at least one chunk",
            "all chunks from a .md file must have ContentKind::Docs",
        );
    }

    #[test]
    fn chunk_content_kind_meta_for_yaml_toml_xml() {
        let cases = [
            ("yaml", "config.yaml", "key: value\nanother: 42\n"),
            ("toml", "Cargo.toml", "[section]\nkey = \"value\"\n"),
            (
                "xml",
                "data.xml",
                r#"<?xml version="1.0"?><root><item>hello</item></root>"#,
            ),
        ];
        for (label, path, source) in cases {
            assert_content_kind(
                &chunk_default(path, source),
                ContentKind::Meta,
                &format!("expected {label} chunks"),
                &format!("{label} chunks must be Meta"),
            );
        }
    }

    #[test]
    fn content_kind_from_extension_covers_code_docs_meta() {
        let code = [
            "rs", "py", "ts", "go", "java", "cpp", "sh", "rb", "kt", "swift", "scala",
        ];
        let docs = ["md", "rst", "txt", "adoc", "org"];
        let meta = [
            "json", "yaml", "yml", "toml", "xml", "lock", "snap", "csv", "tsv", "proto",
        ];

        let table: [(ContentKind, &[&str]); 3] = [
            (ContentKind::Code, &code),
            (ContentKind::Docs, &docs),
            (ContentKind::Meta, &meta),
        ];
        for (expected, extensions) in table {
            for ext in extensions {
                assert_eq!(
                    ContentKind::from_extension(ext),
                    expected,
                    ".{ext} should be {expected:?}"
                );
            }
        }
    }

    #[test]
    fn sliding_window_chunks_carry_content_kind() {
        // `.txt` has no grammar, so this goes through the plain-text path.
        let chunks = chunk_source_for_path(
            Path::new("notes.txt"),
            "just some plain text with no tree-sitter grammar support\n",
            false,
            &ChunkConfig::default(),
        );
        assert_content_kind(
            &chunks,
            ContentKind::Docs,
            "expected at least one chunk",
            "notes.txt chunks must be Docs",
        );
    }
}