1use std::path::Path;
9use streaming_iterator::StreamingIterator;
10use tree_sitter::{Parser, QueryCursor};
11
12#[derive(
25 Debug,
26 Clone,
27 Copy,
28 Default,
29 PartialEq,
30 Eq,
31 Hash,
32 rkyv::Archive,
33 rkyv::Serialize,
34 rkyv::Deserialize,
35 bitcode::Encode,
36 bitcode::Decode,
37)]
38pub enum ContentKind {
39 #[default]
44 Code,
45 Docs,
47 Meta,
50}
51
52impl ContentKind {
53 #[must_use]
60 pub fn from_extension(ext: &str) -> Self {
61 match ext.to_ascii_lowercase().as_str() {
62 "md" | "rst" | "txt" | "adoc" | "asciidoc" | "org" => Self::Docs,
64 "json" | "yaml" | "yml" | "toml" | "xml" | "lock" | "snap" | "csv" | "tsv"
70 | "proto" | "rdf" | "owl" | "tfvars" => Self::Meta,
71 _ => Self::Code,
73 }
74 }
75}
76
/// Size limits that control how source text is split into chunks.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Largest text, in bytes, that is emitted as a single chunk; anything
    /// larger is split into sliding windows.
    pub max_chunk_bytes: usize,
    /// Upper bound, in bytes, on the size of one sliding window.
    pub window_size: usize,
    /// Bytes shared by consecutive windows: each window starts
    /// `window_size - window_overlap` bytes (at least one) after the previous.
    pub window_overlap: usize,
}

impl Default for ChunkConfig {
    /// Defaults: 4 KiB chunks, 2 KiB windows, 512-byte overlap.
    fn default() -> Self {
        const MAX_CHUNK_BYTES: usize = 4096;
        const WINDOW_SIZE: usize = 2048;
        const WINDOW_OVERLAP: usize = 512;

        ChunkConfig {
            max_chunk_bytes: MAX_CHUNK_BYTES,
            window_size: WINDOW_SIZE,
            window_overlap: WINDOW_OVERLAP,
        }
    }
}
106
107#[derive(
109 Debug,
110 Clone,
111 rkyv::Archive,
112 rkyv::Serialize,
113 rkyv::Deserialize,
114 bitcode::Encode,
115 bitcode::Decode,
116)]
117pub struct CodeChunk {
118 pub file_path: String,
120 pub name: String,
122 pub kind: String,
129 pub content_kind: ContentKind,
135 pub start_line: usize,
138 pub end_line: usize,
140 pub symbol_line: usize,
153 pub content: String,
155 pub enriched_content: String,
158 pub qualified_name: Option<String>,
172}
173
174#[must_use]
181pub fn build_scope_chain(node: tree_sitter::Node<'_>, source: &str) -> String {
182 const CONTAINER_KINDS: &[&str] = &[
184 "impl_item",
186 "trait_item",
187 "mod_item",
188 "class_definition",
190 "module",
191 "class_declaration",
193 "type_declaration",
197 "namespace_definition",
199 "class_specifier",
200 ];
201
202 const NAME_FIELDS: &[&str] = &["name", "type"];
206
207 let mut parts = Vec::new();
208 let mut current = node.parent();
209 while let Some(parent) = current {
210 let kind = parent.kind();
211 if CONTAINER_KINDS.contains(&kind) {
212 let name = NAME_FIELDS
213 .iter()
214 .find_map(|field| parent.child_by_field_name(field))
215 .map_or(kind, |n| &source[n.start_byte()..n.end_byte()]);
216 parts.push(format!("{kind} {name}"));
217 }
218 current = parent.parent();
219 }
220 parts.reverse();
221 parts.join(" > ")
222}
223
224#[must_use]
230pub fn extract_signature(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
231 let name_node = node.child_by_field_name("name")?;
232 let body_node = node
233 .child_by_field_name("body")
234 .or_else(|| node.child_by_field_name("block"))?;
235 let start = name_node.start_byte();
236 let end = body_node.start_byte();
237 if start >= end {
238 return None;
239 }
240 let sig = source[start..end].trim();
241 if sig.is_empty() {
242 None
243 } else {
244 Some(sig.to_string())
245 }
246}
247
/// Compacts the whitespace of `source` without touching its tokens.
///
/// * Leading indentation is measured in columns (a space counts 1, a tab
///   counts 2) and halved, rounding up, then re-emitted as spaces.
/// * Trailing whitespace is removed from every line.
/// * A run of blank (whitespace-only) lines collapses to one empty line.
/// * The output ends with a newline only if `source` does.
#[must_use]
pub fn minify_whitespace(source: &str) -> String {
    let mut out = String::with_capacity(source.len());
    let mut previous_blank = false;

    for raw in source.lines() {
        let text = raw.trim_start();

        if text.is_empty() {
            // Only the first line of a blank run is kept.
            if !previous_blank {
                out.push('\n');
            }
            previous_blank = true;
            continue;
        }
        previous_blank = false;

        let columns: usize = raw
            .chars()
            .take_while(|c| matches!(*c, ' ' | '\t'))
            .map(|c| if c == '\t' { 2 } else { 1 })
            .sum();
        out.extend(std::iter::repeat(' ').take(columns.div_ceil(2)));
        out.push_str(text.trim_end());
        out.push('\n');
    }

    // Every emitted line got a '\n'; drop the last one if the input had none.
    if !source.ends_with('\n') && out.ends_with('\n') {
        out.pop();
    }

    out
}
298
299fn build_enriched_content(
304 path: &Path,
305 node: tree_sitter::Node<'_>,
306 source: &str,
307 content: &str,
308 max_bytes: usize,
309) -> String {
310 let scope = build_scope_chain(node, source);
311 let sig = extract_signature(node, source).unwrap_or_default();
312 let rel_path = path.display().to_string();
313
314 let header = if scope.is_empty() && sig.is_empty() {
315 format!("// {rel_path}\n")
316 } else if scope.is_empty() {
317 format!("// {rel_path} | defines: {sig}\n")
318 } else if sig.is_empty() {
319 format!("// {rel_path} | {scope}\n")
320 } else {
321 format!("// {rel_path} | {scope} | defines: {sig}\n")
322 };
323
324 let minified = minify_whitespace(content);
327
328 if header.len() + minified.len() > max_bytes {
329 minified
330 } else {
331 format!("{header}{minified}")
332 }
333}
334
335#[must_use]
344pub fn chunk_file(
345 path: &Path,
346 source: &str,
347 config: &crate::languages::LangConfig,
348 chunk_config: &ChunkConfig,
349) -> Vec<CodeChunk> {
350 let mut parser = Parser::new();
351 if parser.set_language(&config.language).is_err() {
352 return sliding_windows(path, source, chunk_config);
353 }
354
355 let Some(tree) = parser.parse(source, None) else {
356 return sliding_windows(path, source, chunk_config);
357 };
358
359 let mut cursor = QueryCursor::new();
360 let mut chunks = Vec::new();
361 let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
362
363 while let Some(m) = matches.next() {
364 let mut name = String::new();
365 let mut name_line: Option<usize> = None;
368 let mut def_node = None;
369 for cap in m.captures {
370 let cap_name = &config.query.capture_names()[cap.index as usize];
371 if *cap_name == "name" {
372 name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
373 name_line = Some(cap.node.start_position().row + 1);
375 } else if *cap_name == "def" {
376 def_node = Some(cap.node);
377 }
378 }
379 if let Some(node) = def_node {
380 let content = &source[node.start_byte()..node.end_byte()];
381 let start_line = node.start_position().row + 1;
382 let symbol_line = name_line.unwrap_or(start_line);
386
387 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
391 let is_hcl = matches!(ext.to_ascii_lowercase().as_str(), "tf" | "tfvars" | "hcl");
392 let qualified_name = if is_hcl && node.kind() == "block" {
393 let composite = crate::languages::derive_hcl_block_name(&node, source.as_bytes());
394 if composite.is_empty() || composite == name {
395 None
396 } else {
397 Some(composite)
398 }
399 } else {
400 None
401 };
402
403 if is_hcl && node.kind() == "block" && name == "locals" {
408 chunks.extend(emit_hcl_local_attribute_chunks(
409 path,
410 source,
411 node,
412 chunk_config,
413 ));
414 }
415
416 if content.len() > chunk_config.max_chunk_bytes {
418 chunks.extend(sliding_windows_with_name(
419 path,
420 content,
421 &name,
422 start_line,
423 chunk_config,
424 ));
425 } else {
426 let enriched = build_enriched_content(
427 path,
428 node,
429 source,
430 content,
431 chunk_config.max_chunk_bytes,
432 );
433 chunks.push(CodeChunk {
434 file_path: path.display().to_string(),
435 name,
436 kind: node.kind().to_string(),
437 content_kind: ContentKind::from_extension(ext),
438 start_line,
439 end_line: node.end_position().row + 1,
440 symbol_line,
441 enriched_content: enriched,
442 content: content.to_string(),
443 qualified_name,
444 });
445 }
446 }
447 }
448
449 if chunks.is_empty() && !source.trim().is_empty() {
451 return sliding_windows(path, source, chunk_config);
452 }
453
454 chunks
455}
456
457fn emit_hcl_local_attribute_chunks(
471 path: &Path,
472 source: &str,
473 locals_block: tree_sitter::Node<'_>,
474 chunk_config: &ChunkConfig,
475) -> Vec<CodeChunk> {
476 let mut out = Vec::new();
477 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
478 let content_kind = ContentKind::from_extension(ext);
479
480 let mut cursor = locals_block.walk();
483 for child in locals_block.children(&mut cursor) {
484 if child.kind() != "body" {
485 continue;
486 }
487 let mut body_cursor = child.walk();
488 for attr in child.children(&mut body_cursor) {
489 if attr.kind() != "attribute" {
490 continue;
491 }
492 let mut name_node: Option<tree_sitter::Node<'_>> = None;
494 let mut attr_cursor = attr.walk();
495 for grandchild in attr.children(&mut attr_cursor) {
496 if grandchild.kind() == "identifier" {
497 name_node = Some(grandchild);
498 break;
499 }
500 }
501 let Some(id_node) = name_node else {
502 continue;
503 };
504 let name_text = source[id_node.start_byte()..id_node.end_byte()].to_string();
505 let content_text = source[attr.start_byte()..attr.end_byte()].to_string();
506 let start_line = attr.start_position().row + 1;
507 let end_line = attr.end_position().row + 1;
508 let symbol_line = id_node.start_position().row + 1;
509 let composite = format!("local.{name_text}");
510 let header = format!("// {} | local: {composite}\n", path.display());
511 let enriched = if header.len() + content_text.len() <= chunk_config.max_chunk_bytes {
512 format!("{header}{content_text}")
513 } else {
514 content_text.clone()
515 };
516 out.push(CodeChunk {
517 file_path: path.display().to_string(),
518 name: name_text,
519 kind: "local_attribute".to_string(),
520 content_kind,
521 start_line,
522 end_line,
523 symbol_line,
524 enriched_content: enriched,
525 content: content_text,
526 qualified_name: Some(composite),
527 });
528 }
529 }
530 out
531}
532
533#[must_use]
543pub fn chunk_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
544 sliding_windows(path, source, chunk_config)
545}
546
/// Returns `true` when `ext` (compared ASCII case-insensitively, without a
/// leading dot) is one of the line-oriented RDF text extensions: `ttl`, `nt`,
/// `n3`, `trig` or `nq`.
#[must_use]
pub fn is_rdf_text_extension(ext: &str) -> bool {
    const RDF_TEXT_EXTENSIONS: [&str; 5] = ["ttl", "nt", "n3", "trig", "nq"];
    RDF_TEXT_EXTENSIONS
        .iter()
        .any(|known| ext.eq_ignore_ascii_case(known))
}
555
556#[must_use]
563pub fn chunk_rdf_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
564 if source.trim().is_empty() {
565 return vec![];
566 }
567
568 let mut chunks = Vec::new();
569 let mut current = String::new();
570 let mut current_start_line = 1usize;
571 let mut current_is_directive = false;
572
573 for (line_idx, line) in source.lines().enumerate() {
574 let line_no = line_idx + 1;
575 let trimmed = line.trim();
576 if trimmed.is_empty() {
577 flush_rdf_block(
578 path,
579 ¤t,
580 current_start_line,
581 chunk_config,
582 &mut chunks,
583 );
584 current.clear();
585 current_is_directive = false;
586 continue;
587 }
588
589 let line_is_directive = is_rdf_directive(trimmed);
590 if !current.is_empty() && current_is_directive && !line_is_directive {
591 flush_rdf_block(
592 path,
593 ¤t,
594 current_start_line,
595 chunk_config,
596 &mut chunks,
597 );
598 current.clear();
599 current_is_directive = false;
600 }
601
602 if current.is_empty() {
603 current_start_line = line_no;
604 current_is_directive = line_is_directive;
605 }
606 current.push_str(line);
607 current.push('\n');
608
609 if !current_is_directive && trimmed.ends_with('.') {
610 flush_rdf_block(
611 path,
612 ¤t,
613 current_start_line,
614 chunk_config,
615 &mut chunks,
616 );
617 current.clear();
618 current_is_directive = false;
619 }
620 }
621
622 flush_rdf_block(
623 path,
624 ¤t,
625 current_start_line,
626 chunk_config,
627 &mut chunks,
628 );
629 if chunks.is_empty() {
630 sliding_windows(path, source, chunk_config)
631 } else {
632 chunks
633 }
634}
635
636#[must_use]
638pub fn chunk_source_for_path(
639 path: &Path,
640 source: &str,
641 text_mode: bool,
642 chunk_config: &ChunkConfig,
643) -> Vec<CodeChunk> {
644 if text_mode {
645 return chunk_text(path, source, chunk_config);
646 }
647
648 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
649 if let Some(lang_config) = crate::languages::config_for_extension(ext) {
650 chunk_file(path, source, &lang_config, chunk_config)
651 } else if is_rdf_text_extension(ext) {
652 chunk_rdf_text(path, source, chunk_config)
653 } else {
654 chunk_text(path, source, chunk_config)
655 }
656}
657
/// Returns `true` when an already-trimmed line starts with one of the RDF
/// directive keywords `@prefix`, `@base`, `PREFIX` or `BASE` (matched
/// case-sensitively, as a plain prefix of the line).
fn is_rdf_directive(trimmed: &str) -> bool {
    const DIRECTIVE_KEYWORDS: [&str; 4] = ["@prefix", "@base", "PREFIX", "BASE"];
    DIRECTIVE_KEYWORDS
        .iter()
        .any(|keyword| trimmed.starts_with(*keyword))
}
664
665fn flush_rdf_block(
666 path: &Path,
667 content: &str,
668 start_line: usize,
669 chunk_config: &ChunkConfig,
670 chunks: &mut Vec<CodeChunk>,
671) {
672 let trimmed = content.trim();
673 if trimmed.is_empty() {
674 return;
675 }
676 let name = rdf_block_name(trimmed, path);
677 let content = format!("{trimmed}\n");
678 if content.len() > chunk_config.max_chunk_bytes {
679 chunks.extend(sliding_window_chunks(
680 &content,
681 path,
682 &name,
683 start_line,
684 chunk_config,
685 ));
686 return;
687 }
688 let header = format!("# {} | rdf: {name}\n", path.display());
689 let enriched_content = if header.len() + content.len() <= chunk_config.max_chunk_bytes {
690 format!("{header}{content}")
691 } else {
692 content.clone()
693 };
694 let line_count = content.lines().count().max(1);
695 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
696 chunks.push(CodeChunk {
697 file_path: path.display().to_string(),
698 name,
699 kind: "rdf_statements".to_string(),
700 content_kind: ContentKind::from_extension(ext),
701 start_line,
702 end_line: start_line + line_count - 1,
703 symbol_line: start_line,
705 enriched_content,
706 content,
707 qualified_name: None,
708 });
709}
710
/// Derives a display name for an RDF block.
///
/// The first line that is neither blank nor a `#` comment decides the name:
/// a prefix directive yields `"@prefix"`, a base directive yields `"@base"`,
/// and any other line yields its first whitespace-separated token with
/// trailing `;`, `,` and `.` removed. When there is no such line, or the
/// token is empty after stripping, the file name of `path` is used.
fn rdf_block_name(content: &str, path: &Path) -> String {
    let file_name = || {
        path.file_name()
            .unwrap_or_default()
            .to_string_lossy()
            .to_string()
    };

    let Some(head) = content
        .lines()
        .map(str::trim)
        .find(|line| !(line.is_empty() || line.starts_with('#')))
    else {
        return file_name();
    };

    if ["@prefix", "PREFIX"].iter().any(|kw| head.starts_with(*kw)) {
        return "@prefix".to_string();
    }
    if ["@base", "BASE"].iter().any(|kw| head.starts_with(*kw)) {
        return "@base".to_string();
    }

    let subject = head
        .split_whitespace()
        .next()
        .map(|token| token.trim_end_matches([';', ',', '.']));
    match subject {
        Some(token) if !token.is_empty() => token.to_string(),
        _ => file_name(),
    }
}
745
746fn sliding_windows(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
748 if source.trim().is_empty() {
749 return vec![];
750 }
751
752 if source.len() <= chunk_config.max_chunk_bytes {
754 let content = source.to_string();
755 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
756 return vec![CodeChunk {
757 file_path: path.display().to_string(),
758 name: path
759 .file_name()
760 .unwrap_or_default()
761 .to_string_lossy()
762 .to_string(),
763 kind: "file".to_string(),
764 content_kind: ContentKind::from_extension(ext),
765 start_line: 1,
766 end_line: source.lines().count(),
767 symbol_line: 1,
769 enriched_content: content.clone(),
770 content,
771 qualified_name: None,
772 }];
773 }
774
775 let file_name = path
776 .file_name()
777 .unwrap_or_default()
778 .to_string_lossy()
779 .to_string();
780 sliding_window_chunks(source, path, &file_name, 1, chunk_config)
781}
782
783fn sliding_windows_with_name(
788 path: &Path,
789 content: &str,
790 name: &str,
791 base_line: usize,
792 chunk_config: &ChunkConfig,
793) -> Vec<CodeChunk> {
794 sliding_window_chunks(content, path, name, base_line, chunk_config)
795}
796
797fn sliding_window_chunks(
804 source: &str,
805 file_path: &Path,
806 name_prefix: &str,
807 base_line: usize,
808 chunk_config: &ChunkConfig,
809) -> Vec<CodeChunk> {
810 let step = chunk_config
811 .window_size
812 .saturating_sub(chunk_config.window_overlap)
813 .max(1);
814 let bytes = source.as_bytes();
815 let mut chunks = Vec::new();
816 let mut offset = 0;
817 let mut window_idx = 0;
818
819 while offset < bytes.len() {
820 let raw_end = (offset + chunk_config.window_size).min(bytes.len());
821
822 let end = if raw_end < bytes.len() {
824 match bytes[offset..raw_end].iter().rposition(|&b| b == b'\n') {
825 Some(pos) => offset + pos + 1,
826 None => raw_end, }
828 } else {
829 raw_end
830 };
831
832 if let Ok(window) = std::str::from_utf8(&bytes[offset..end])
834 && !window.trim().is_empty()
835 {
836 let start_line = base_line + source[..offset].matches('\n').count();
837 let content_lines = window.lines().count().max(1);
838 let end_line = start_line + content_lines - 1;
839 let content = window.to_string();
840 let ext = file_path.extension().and_then(|e| e.to_str()).unwrap_or("");
841 chunks.push(CodeChunk {
842 file_path: file_path.display().to_string(),
843 name: format!("{name_prefix}[{window_idx}]"),
844 kind: "window".to_string(),
845 content_kind: ContentKind::from_extension(ext),
846 start_line,
847 end_line,
848 symbol_line: start_line,
850 enriched_content: content.clone(),
851 content,
852 qualified_name: None,
853 });
854 window_idx += 1;
855 }
856
857 offset += step;
858 }
859
860 chunks
861}
862
#[cfg(test)]
mod tests {
    use super::*;
    use std::fmt::Write as _;
    use std::path::Path;

    /// Chunks `source` as the file `file`, using the language configuration
    /// registered for the file's extension.
    fn chunk_as(file: &str, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
        let path = Path::new(file);
        let ext = path
            .extension()
            .and_then(|e| e.to_str())
            .expect("test path must have an extension");
        let config = crate::languages::config_for_extension(ext)
            .expect("extension must have a registered language config");
        chunk_file(path, source, &config, chunk_config)
    }

    /// [`chunk_as`] with the default [`ChunkConfig`].
    fn chunk_default(file: &str, source: &str) -> Vec<CodeChunk> {
        chunk_as(file, source, &ChunkConfig::default())
    }

    /// Parses `source` with the grammar registered for `ext`, returning the
    /// tree together with the language configuration used.
    fn parse_source(
        source: &str,
        ext: &str,
    ) -> (
        tree_sitter::Tree,
        std::sync::Arc<crate::languages::LangConfig>,
    ) {
        let config = crate::languages::config_for_extension(ext).unwrap();
        let mut parser = Parser::new();
        parser.set_language(&config.language).unwrap();
        let tree = parser.parse(source, None).unwrap();
        (tree, config)
    }

    /// Runs the chunking query over `tree` and returns the last node captured
    /// as `def`, optionally restricted to nodes of `required_kind`.
    fn last_def_node<'tree>(
        tree: &'tree tree_sitter::Tree,
        config: &crate::languages::LangConfig,
        source: &str,
        required_kind: Option<&str>,
    ) -> Option<tree_sitter::Node<'tree>> {
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        let mut found = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                let kind_matches = required_kind.is_none_or(|kind| cap.node.kind() == kind);
                if *cap_name == "def" && kind_matches {
                    found = Some(cap.node);
                }
            }
        }
        found
    }

    /// Asserts that `chunks` is non-empty and that every chunk has the
    /// `expected` content kind; `what` labels the input in failure messages.
    fn assert_content_kind(chunks: &[CodeChunk], expected: ContentKind, what: &str) {
        assert!(!chunks.is_empty(), "expected at least one chunk for {what}");
        assert!(
            chunks.iter().all(|c| c.content_kind == expected),
            "all chunks from {what} must have ContentKind::{expected:?}; got: {:?}",
            chunks.iter().map(|c| c.content_kind).collect::<Vec<_>>()
        );
    }

    #[test]
    fn chunks_rust_functions_and_structs() {
        let source = "fn hello() { println!(\"hi\"); }\nfn world() {}\nstruct Foo { x: i32 }";
        let chunks = chunk_default("test.rs", source);
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {}",
            chunks.len()
        );
        assert!(chunks.iter().any(|c| c.name == "hello"));
        assert!(chunks.iter().any(|c| c.name == "world"));
    }

    #[test]
    fn chunks_python_functions_and_classes() {
        let source = "def greet(name):\n    pass\n\nclass Foo:\n    pass\n";
        let chunks = chunk_default("test.py", source);
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.name == "greet"));
        assert!(chunks.iter().any(|c| c.name == "Foo"));
    }

    #[test]
    fn chunks_python_stub_functions_and_classes() {
        let source = "from typing import Protocol\n\ndef greet(name: str) -> str: ...\n\nclass Foo(Protocol):\n    value: int\n";
        let chunks = chunk_default("test.pyi", source);
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.name == "greet"));
        assert!(chunks.iter().any(|c| c.name == "Foo"));
    }

    #[test]
    fn fallback_small_file_single_chunk() {
        // Only comments: no definitions, so the whole file becomes one chunk.
        let source = "// just a comment\n// and another\n";
        let chunks = chunk_default("script.js", source);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].kind, "file");
    }

    #[test]
    fn fallback_large_file_produces_windows() {
        let line = "console.log('hello world, this is a long line of javascript code');\n";
        let source: String = line.repeat(200);
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let chunks = chunk_as("big.js", &source, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected multiple windows, got {}",
            chunks.len()
        );
        assert!(chunks.iter().all(|c| c.kind == "window"));
        assert!(chunks[0].name.contains("[0]"));
    }

    #[test]
    fn large_definition_is_windowed() {
        let mut source = String::from("fn big_function() {\n");
        for i in 0..200 {
            writeln!(source, "    let var_{i} = {i} * 2 + 1; // some computation").unwrap();
        }
        source.push_str("}\n");
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let chunks = chunk_as("test.rs", &source, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected windowed chunks, got {}",
            chunks.len()
        );
        assert!(chunks[0].name.starts_with("big_function["));
    }

    #[test]
    fn empty_file_produces_no_chunks() {
        let chunks = chunk_default("empty.rs", "");
        assert!(chunks.is_empty());
    }

    #[test]
    fn scope_chain_rust_impl_method() {
        let source = "impl Foo {\n    fn bar(&self) {}\n}";
        let (tree, config) = parse_source(source, "rs");
        let node = last_def_node(&tree, &config, source, None).expect("should find a @def node");

        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("impl_item"),
            "scope should contain impl_item, got: {scope}"
        );
        assert!(
            scope.contains("Foo"),
            "scope should contain 'Foo', got: {scope}"
        );
    }

    #[test]
    fn scope_chain_python_class_method() {
        let source = "class Greeter:\n    def say_hello(self):\n        pass\n";
        let (tree, config) = parse_source(source, "py");
        // The class itself is also a @def; pick the method specifically.
        let node = last_def_node(&tree, &config, source, Some("function_definition"))
            .expect("should find say_hello @def node");

        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("class_definition"),
            "scope should contain class_definition, got: {scope}"
        );
        assert!(
            scope.contains("Greeter"),
            "scope should contain 'Greeter', got: {scope}"
        );
    }

    #[test]
    fn extract_signature_rust_function() {
        let source = "fn greet(name: &str) -> String { name.to_string() }";
        let (tree, config) = parse_source(source, "rs");
        let node = last_def_node(&tree, &config, source, None).expect("should find @def node");

        let sig = extract_signature(node, source).expect("should extract signature");
        assert!(
            sig.contains("greet"),
            "signature should contain 'greet', got: {sig}"
        );
        assert!(
            sig.contains("name: &str"),
            "signature should contain parameter, got: {sig}"
        );
        assert!(
            sig.contains("-> String"),
            "signature should contain return type, got: {sig}"
        );
    }

    #[test]
    fn enriched_content_has_header() {
        let source = "fn hello() { println!(\"hi\"); }";
        let chunks = chunk_default("src/main.rs", source);
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            chunk.enriched_content.starts_with("//"),
            "enriched_content should start with '//' header, got: {}",
            &chunk.enriched_content[..chunk.enriched_content.len().min(80)]
        );
        assert!(
            chunk.enriched_content.contains("src/main.rs"),
            "enriched_content should contain file path"
        );
        assert!(
            !chunk.content.starts_with("//"),
            "raw content should not start with header"
        );
    }

    #[test]
    fn sliding_window_enriched_equals_content() {
        let source = "let x = 42;\nconsole.log(x);\n";
        let chunks = chunk_text(Path::new("test.txt"), source, &ChunkConfig::default());
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert_eq!(
                chunk.enriched_content, chunk.content,
                "sliding window chunks should have enriched_content == content"
            );
        }
    }

    #[test]
    fn chunks_rdf_xml_and_owl_elements_with_tree_sitter() {
        let source = r#"<?xml version="1.0"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:owl="http://www.w3.org/2002/07/owl#">
  <owl:Class rdf:about="http://example.com/Person"/>
  <owl:ObjectProperty rdf:about="http://example.com/knows"/>
</rdf:RDF>"#;

        let rdf_chunks = chunk_default("ontology.rdf", source);
        let owl_chunks = chunk_default("ontology.owl", source);

        assert!(rdf_chunks.iter().any(|chunk| chunk.name == "owl:Class"));
        assert!(
            rdf_chunks
                .iter()
                .any(|chunk| chunk.name == "owl:ObjectProperty")
        );
        assert!(rdf_chunks.iter().all(|chunk| chunk.kind == "element"));
        assert!(owl_chunks.iter().any(|chunk| chunk.name == "owl:Class"));
    }

    #[test]
    fn chunks_turtle_by_rdf_statement_blocks() {
        let source = r#"@prefix ex: <http://example.com/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

ex:Person
    a owl:Class ;
    ex:label "Person" .

ex:knows
    a owl:ObjectProperty ;
    ex:domain ex:Person ;
    ex:range ex:Person .
"#;

        let chunks = chunk_rdf_text(Path::new("ontology.ttl"), source, &ChunkConfig::default());

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].kind, "rdf_statements");
        assert_eq!(chunks[0].name, "@prefix");
        assert_eq!(chunks[1].name, "ex:Person");
        assert_eq!(chunks[2].name, "ex:knows");
    }

    #[test]
    fn header_dropped_when_exceeding_max_bytes() {
        // The definition fits in a chunk, but definition plus header does not.
        let tiny_config = ChunkConfig {
            max_chunk_bytes: 60,
            window_size: 30,
            window_overlap: 10,
        };
        let source = "fn f() { let x = 42; return x; }";
        assert!(source.len() <= tiny_config.max_chunk_bytes);

        let chunks = chunk_as("long/path/to/file.rs", source, &tiny_config);
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            !chunk.enriched_content.starts_with("//"),
            "header should be dropped when it would exceed max_chunk_bytes"
        );
        assert_eq!(chunk.content, source, "raw content should be unchanged");
    }

    #[test]
    fn minify_whitespace_normalizes_indent_and_strips_trailing() {
        let source = "fn foo() {\n        let x = 1;\n        let y = 2;\n}\n";
        let result = minify_whitespace(source);
        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(
            lines[1], "    let x = 1;",
            "8-space indent should become 4-space"
        );
        assert_eq!(
            lines[2], "    let y = 2;",
            "8-space indent should become 4-space"
        );

        let with_trailing = "fn bar()   \n    return 1;   \n";
        let result2 = minify_whitespace(with_trailing);
        assert!(
            result2.lines().all(|l| !l.ends_with(' ')),
            "trailing whitespace should be stripped"
        );

        let with_blanks = "a\n\n\n\nb\n";
        let result3 = minify_whitespace(with_blanks);
        // Lengths of every run of consecutive empty lines in the output.
        let blank_runs: Vec<usize> = {
            let mut runs = Vec::new();
            let mut count = 0usize;
            for line in result3.lines() {
                if line.is_empty() {
                    count += 1;
                } else {
                    if count > 0 {
                        runs.push(count);
                    }
                    count = 0;
                }
            }
            runs
        };
        assert!(
            blank_runs.iter().all(|&n| n <= 1),
            "3+ blank lines should collapse to 1, got runs: {blank_runs:?}"
        );
    }

    #[test]
    fn chunk_content_kind_code_for_rust_file() {
        let chunks = chunk_default("src/lib.rs", "fn hello() {}\n");
        assert_content_kind(&chunks, ContentKind::Code, "a .rs file");
    }

    #[test]
    fn chunk_content_kind_meta_for_json_file() {
        let chunks = chunk_default("data.json", r#"{"key": "value", "answer": 42}"#);
        assert_content_kind(&chunks, ContentKind::Meta, "a .json file");
    }

    #[test]
    fn chunk_content_kind_docs_for_md_file() {
        let chunks = chunk_default("README.md", "# Title\n\nSome prose content.\n");
        assert_content_kind(&chunks, ContentKind::Docs, "a .md file");
    }

    #[test]
    fn chunk_content_kind_meta_for_yaml_toml_xml() {
        let cases = [
            ("config.yaml", "key: value\nanother: 42\n"),
            ("Cargo.toml", "[section]\nkey = \"value\"\n"),
            (
                "data.xml",
                r#"<?xml version="1.0"?><root><item>hello</item></root>"#,
            ),
        ];
        for (file, source) in cases {
            let chunks = chunk_default(file, source);
            assert_content_kind(&chunks, ContentKind::Meta, file);
        }
    }

    #[test]
    fn content_kind_from_extension_covers_code_docs_meta() {
        for ext in [
            "rs", "py", "ts", "go", "java", "cpp", "sh", "rb", "kt", "swift", "scala",
        ] {
            assert_eq!(
                ContentKind::from_extension(ext),
                ContentKind::Code,
                ".{ext} should be Code"
            );
        }
        for ext in ["md", "rst", "txt", "adoc", "org"] {
            assert_eq!(
                ContentKind::from_extension(ext),
                ContentKind::Docs,
                ".{ext} should be Docs"
            );
        }
        for ext in [
            "json", "yaml", "yml", "toml", "xml", "lock", "snap", "csv", "tsv", "proto",
        ] {
            assert_eq!(
                ContentKind::from_extension(ext),
                ContentKind::Meta,
                ".{ext} should be Meta"
            );
        }
    }

    #[test]
    fn sliding_window_chunks_carry_content_kind() {
        let source = "just some plain text with no tree-sitter grammar support\n";
        let chunks = chunk_source_for_path(
            Path::new("notes.txt"),
            source,
            false,
            &ChunkConfig::default(),
        );
        assert_content_kind(&chunks, ContentKind::Docs, "notes.txt");
    }
}