1use std::path::Path;
9use streaming_iterator::StreamingIterator;
10use tree_sitter::{Parser, QueryCursor};
11
/// Size limits that control how source text is split into chunks.
///
/// All three values are byte counts; they are compared against `str::len()`
/// and used as byte offsets by the sliding-window chunker.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Largest chunk (in bytes) that is emitted as a single unit; anything
    /// bigger is split into sliding windows.
    pub max_chunk_bytes: usize,
    /// Width in bytes of one sliding window.
    pub window_size: usize,
    /// Number of bytes shared between two consecutive windows.
    pub window_overlap: usize,
}

impl Default for ChunkConfig {
    /// Returns the default limits: 4 KiB chunks, 2 KiB windows, 512 B overlap.
    fn default() -> Self {
        const KIB: usize = 1024;

        ChunkConfig {
            max_chunk_bytes: 4 * KIB,
            window_size: 2 * KIB,
            window_overlap: KIB / 2,
        }
    }
}
41
/// One unit of chunked source text together with its location metadata.
///
/// NOTE(review): the type derives the rkyv and bitcode codecs, so the field
/// order below presumably matters for already-serialized data — confirm
/// before reordering or inserting fields.
#[derive(
    Debug,
    Clone,
    rkyv::Archive,
    rkyv::Serialize,
    rkyv::Deserialize,
    bitcode::Encode,
    bitcode::Decode,
)]
pub struct CodeChunk {
    /// Path of the originating file, rendered with `Path::display`.
    pub file_path: String,
    /// Label of the chunk: the text of the query's `name` capture, the file
    /// name, the first token of an RDF block, or `<prefix>[<index>]` for a
    /// sliding window.
    pub name: String,
    /// Category of the chunk: a tree-sitter node kind, or one of the literal
    /// strings `"file"`, `"window"` and `"rdf_statements"`.
    pub kind: String,
    /// 1-based line on which the chunk starts.
    pub start_line: usize,
    /// 1-based line on which the chunk ends (inclusive).
    pub end_line: usize,
    /// The chunk text exactly as it was sliced from the input.
    pub content: String,
    /// Text variant that may carry a one-line header and, for syntax-derived
    /// chunks, whitespace-minified content; identical to `content` for
    /// `"file"` and `"window"` chunks.
    pub enriched_content: String,
}
69
70#[must_use]
77pub fn build_scope_chain(node: tree_sitter::Node<'_>, source: &str) -> String {
78 const CONTAINER_KINDS: &[&str] = &[
80 "impl_item",
82 "trait_item",
83 "mod_item",
84 "class_definition",
86 "module",
87 "class_declaration",
89 "type_declaration",
93 "namespace_definition",
95 "class_specifier",
96 ];
97
98 const NAME_FIELDS: &[&str] = &["name", "type"];
102
103 let mut parts = Vec::new();
104 let mut current = node.parent();
105 while let Some(parent) = current {
106 let kind = parent.kind();
107 if CONTAINER_KINDS.contains(&kind) {
108 let name = NAME_FIELDS
109 .iter()
110 .find_map(|field| parent.child_by_field_name(field))
111 .map_or(kind, |n| &source[n.start_byte()..n.end_byte()]);
112 parts.push(format!("{kind} {name}"));
113 }
114 current = parent.parent();
115 }
116 parts.reverse();
117 parts.join(" > ")
118}
119
120#[must_use]
126pub fn extract_signature(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
127 let name_node = node.child_by_field_name("name")?;
128 let body_node = node
129 .child_by_field_name("body")
130 .or_else(|| node.child_by_field_name("block"))?;
131 let start = name_node.start_byte();
132 let end = body_node.start_byte();
133 if start >= end {
134 return None;
135 }
136 let sig = source[start..end].trim();
137 if sig.is_empty() {
138 None
139 } else {
140 Some(sig.to_string())
141 }
142}
143
/// Compacts the whitespace of `source` without touching its tokens.
///
/// * Leading indentation is halved (rounded up); a tab counts as two columns
///   and the result is written with spaces only.
/// * Trailing whitespace is removed from every line.
/// * A run of blank lines is reduced to a single empty line.
/// * Line endings are normalised to `\n`; a final newline is only kept when
///   `source` itself ends with one.
#[must_use]
pub fn minify_whitespace(source: &str) -> String {
    let mut out = String::with_capacity(source.len());
    let mut previous_blank = false;

    for raw_line in source.lines() {
        let body = raw_line.trim();

        if body.is_empty() {
            // Only the first line of a blank run survives.
            if !previous_blank {
                out.push('\n');
            }
            previous_blank = true;
            continue;
        }
        previous_blank = false;

        // Visual width of the leading run of spaces/tabs (tab == 2 columns).
        let width: usize = raw_line
            .chars()
            .map_while(|c| match c {
                ' ' => Some(1),
                '\t' => Some(2),
                _ => None,
            })
            .sum();

        out.extend(std::iter::repeat(' ').take(width.div_ceil(2)));
        out.push_str(body);
        out.push('\n');
    }

    // Do not invent a trailing newline the input did not have.
    if !source.ends_with('\n') && out.ends_with('\n') {
        out.pop();
    }

    out
}
194
195fn build_enriched_content(
200 path: &Path,
201 node: tree_sitter::Node<'_>,
202 source: &str,
203 content: &str,
204 max_bytes: usize,
205) -> String {
206 let scope = build_scope_chain(node, source);
207 let sig = extract_signature(node, source).unwrap_or_default();
208 let rel_path = path.display().to_string();
209
210 let header = if scope.is_empty() && sig.is_empty() {
211 format!("// {rel_path}\n")
212 } else if scope.is_empty() {
213 format!("// {rel_path} | defines: {sig}\n")
214 } else if sig.is_empty() {
215 format!("// {rel_path} | {scope}\n")
216 } else {
217 format!("// {rel_path} | {scope} | defines: {sig}\n")
218 };
219
220 let minified = minify_whitespace(content);
223
224 if header.len() + minified.len() > max_bytes {
225 minified
226 } else {
227 format!("{header}{minified}")
228 }
229}
230
231#[must_use]
240pub fn chunk_file(
241 path: &Path,
242 source: &str,
243 config: &crate::languages::LangConfig,
244 chunk_config: &ChunkConfig,
245) -> Vec<CodeChunk> {
246 let mut parser = Parser::new();
247 if parser.set_language(&config.language).is_err() {
248 return sliding_windows(path, source, chunk_config);
249 }
250
251 let Some(tree) = parser.parse(source, None) else {
252 return sliding_windows(path, source, chunk_config);
253 };
254
255 let mut cursor = QueryCursor::new();
256 let mut chunks = Vec::new();
257 let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
258
259 while let Some(m) = matches.next() {
260 let mut name = String::new();
261 let mut def_node = None;
262 for cap in m.captures {
263 let cap_name = &config.query.capture_names()[cap.index as usize];
264 if *cap_name == "name" {
265 name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
266 } else if *cap_name == "def" {
267 def_node = Some(cap.node);
268 }
269 }
270 if let Some(node) = def_node {
271 let content = &source[node.start_byte()..node.end_byte()];
272 let start_line = node.start_position().row + 1;
273
274 if content.len() > chunk_config.max_chunk_bytes {
276 chunks.extend(sliding_windows_with_name(
277 path,
278 content,
279 &name,
280 start_line,
281 chunk_config,
282 ));
283 } else {
284 let enriched = build_enriched_content(
285 path,
286 node,
287 source,
288 content,
289 chunk_config.max_chunk_bytes,
290 );
291 chunks.push(CodeChunk {
292 file_path: path.display().to_string(),
293 name,
294 kind: node.kind().to_string(),
295 start_line,
296 end_line: node.end_position().row + 1,
297 enriched_content: enriched,
298 content: content.to_string(),
299 });
300 }
301 }
302 }
303
304 if chunks.is_empty() && !source.trim().is_empty() {
306 return sliding_windows(path, source, chunk_config);
307 }
308
309 chunks
310}
311
/// Chunks `source` as plain text, without any syntax awareness.
///
/// Delegates directly to `sliding_windows`, so a blank input yields no
/// chunks, an input of at most `chunk_config.max_chunk_bytes` bytes yields a
/// single `"file"` chunk, and anything larger yields `"window"` chunks.
#[must_use]
pub fn chunk_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
    sliding_windows(path, source, chunk_config)
}
325
/// Returns `true` when `ext` (without a leading dot) is one of the RDF text
/// extensions `ttl`, `nt`, `n3`, `trig` or `nq`, compared ASCII
/// case-insensitively.
#[must_use]
pub fn is_rdf_text_extension(ext: &str) -> bool {
    const RDF_TEXT_EXTENSIONS: [&str; 5] = ["ttl", "nt", "n3", "trig", "nq"];

    RDF_TEXT_EXTENSIONS
        .iter()
        .any(|known| ext.eq_ignore_ascii_case(known))
}
334
335#[must_use]
342pub fn chunk_rdf_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
343 if source.trim().is_empty() {
344 return vec![];
345 }
346
347 let mut chunks = Vec::new();
348 let mut current = String::new();
349 let mut current_start_line = 1usize;
350 let mut current_is_directive = false;
351
352 for (line_idx, line) in source.lines().enumerate() {
353 let line_no = line_idx + 1;
354 let trimmed = line.trim();
355 if trimmed.is_empty() {
356 flush_rdf_block(
357 path,
358 ¤t,
359 current_start_line,
360 chunk_config,
361 &mut chunks,
362 );
363 current.clear();
364 current_is_directive = false;
365 continue;
366 }
367
368 let line_is_directive = is_rdf_directive(trimmed);
369 if !current.is_empty() && current_is_directive && !line_is_directive {
370 flush_rdf_block(
371 path,
372 ¤t,
373 current_start_line,
374 chunk_config,
375 &mut chunks,
376 );
377 current.clear();
378 current_is_directive = false;
379 }
380
381 if current.is_empty() {
382 current_start_line = line_no;
383 current_is_directive = line_is_directive;
384 }
385 current.push_str(line);
386 current.push('\n');
387
388 if !current_is_directive && trimmed.ends_with('.') {
389 flush_rdf_block(
390 path,
391 ¤t,
392 current_start_line,
393 chunk_config,
394 &mut chunks,
395 );
396 current.clear();
397 current_is_directive = false;
398 }
399 }
400
401 flush_rdf_block(
402 path,
403 ¤t,
404 current_start_line,
405 chunk_config,
406 &mut chunks,
407 );
408 if chunks.is_empty() {
409 sliding_windows(path, source, chunk_config)
410 } else {
411 chunks
412 }
413}
414
415#[must_use]
417pub fn chunk_source_for_path(
418 path: &Path,
419 source: &str,
420 text_mode: bool,
421 chunk_config: &ChunkConfig,
422) -> Vec<CodeChunk> {
423 if text_mode {
424 return chunk_text(path, source, chunk_config);
425 }
426
427 let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
428 if let Some(lang_config) = crate::languages::config_for_extension(ext) {
429 chunk_file(path, source, &lang_config, chunk_config)
430 } else if is_rdf_text_extension(ext) {
431 chunk_rdf_text(path, source, chunk_config)
432 } else {
433 chunk_text(path, source, chunk_config)
434 }
435}
436
/// Returns `true` when `trimmed` (a line without surrounding whitespace)
/// starts an RDF prefix/base directive.
///
/// Two spellings are recognised:
///
/// * the Turtle forms `@prefix` / `@base`, matched case-sensitively;
/// * the SPARQL-style forms `PREFIX` / `BASE`, matched case-insensitively and
///   only when the keyword is a token of its own, i.e. followed by
///   whitespace, `<`, or the end of the line. This keeps statements whose
///   subject merely starts with those letters (`BASELINE:item ...`,
///   `base:Thing ...`) from being treated as directives.
fn is_rdf_directive(trimmed: &str) -> bool {
    /// Case-insensitive keyword match that also requires a token boundary.
    fn starts_with_sparql_keyword(line: &str, keyword: &str) -> bool {
        // `get` yields `None` when the line is too short or the cut would
        // fall inside a multi-byte character.
        let Some(head) = line.get(..keyword.len()) else {
            return false;
        };
        if !head.eq_ignore_ascii_case(keyword) {
            return false;
        }
        match line[keyword.len()..].chars().next() {
            None => true,
            Some(next) => next.is_whitespace() || next == '<',
        }
    }

    trimmed.starts_with("@prefix")
        || trimmed.starts_with("@base")
        || starts_with_sparql_keyword(trimmed, "PREFIX")
        || starts_with_sparql_keyword(trimmed, "BASE")
}
443
444fn flush_rdf_block(
445 path: &Path,
446 content: &str,
447 start_line: usize,
448 chunk_config: &ChunkConfig,
449 chunks: &mut Vec<CodeChunk>,
450) {
451 let trimmed = content.trim();
452 if trimmed.is_empty() {
453 return;
454 }
455 let name = rdf_block_name(trimmed, path);
456 let content = format!("{trimmed}\n");
457 if content.len() > chunk_config.max_chunk_bytes {
458 chunks.extend(sliding_window_chunks(
459 &content,
460 path,
461 &name,
462 start_line,
463 chunk_config,
464 ));
465 return;
466 }
467 let header = format!("# {} | rdf: {name}\n", path.display());
468 let enriched_content = if header.len() + content.len() <= chunk_config.max_chunk_bytes {
469 format!("{header}{content}")
470 } else {
471 content.clone()
472 };
473 let line_count = content.lines().count().max(1);
474 chunks.push(CodeChunk {
475 file_path: path.display().to_string(),
476 name,
477 kind: "rdf_statements".to_string(),
478 start_line,
479 end_line: start_line + line_count - 1,
480 enriched_content,
481 content,
482 });
483}
484
/// Derives a short display name for an RDF block.
///
/// The first line that is neither blank nor a `#` comment decides:
///
/// * a prefix directive (`@prefix`, or `PREFIX` in any letter case) yields
///   `"@prefix"`;
/// * a base directive (`@base`, or `BASE` in any letter case) yields
///   `"@base"`;
/// * otherwise the first whitespace-separated token is used, with trailing
///   `;`, `,` and `.` removed.
///
/// The SPARQL-style keywords only count when they stand alone as a token
/// (followed by whitespace, `<`, or the end of the line), so a subject such
/// as `BASELINE:item` is reported by its own name. The file name of `path`
/// is returned when the block has no usable line or token.
fn rdf_block_name(content: &str, path: &Path) -> String {
    /// Case-insensitive keyword match that also requires a token boundary.
    fn starts_with_sparql_keyword(line: &str, keyword: &str) -> bool {
        // `get` yields `None` when the line is too short or the cut would
        // fall inside a multi-byte character.
        let Some(head) = line.get(..keyword.len()) else {
            return false;
        };
        if !head.eq_ignore_ascii_case(keyword) {
            return false;
        }
        match line[keyword.len()..].chars().next() {
            None => true,
            Some(next) => next.is_whitespace() || next == '<',
        }
    }

    // Shared fallback for blocks that offer nothing to name them by.
    let file_name = || {
        path.file_name()
            .unwrap_or_default()
            .to_string_lossy()
            .into_owned()
    };

    let first_meaningful = content
        .lines()
        .map(str::trim)
        .find(|line| !line.is_empty() && !line.starts_with('#'));
    let Some(first) = first_meaningful else {
        return file_name();
    };

    if first.starts_with("@prefix") || starts_with_sparql_keyword(first, "PREFIX") {
        return "@prefix".to_string();
    }
    if first.starts_with("@base") || starts_with_sparql_keyword(first, "BASE") {
        return "@base".to_string();
    }

    // Statement punctuation glued to the subject is not part of its name.
    let token = first
        .split_whitespace()
        .next()
        .unwrap_or("")
        .trim_end_matches(|c: char| matches!(c, ';' | ',' | '.'));
    if token.is_empty() {
        file_name()
    } else {
        token.to_string()
    }
}
519
520fn sliding_windows(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
522 if source.trim().is_empty() {
523 return vec![];
524 }
525
526 if source.len() <= chunk_config.max_chunk_bytes {
528 let content = source.to_string();
529 return vec![CodeChunk {
530 file_path: path.display().to_string(),
531 name: path
532 .file_name()
533 .unwrap_or_default()
534 .to_string_lossy()
535 .to_string(),
536 kind: "file".to_string(),
537 start_line: 1,
538 end_line: source.lines().count(),
539 enriched_content: content.clone(),
540 content,
541 }];
542 }
543
544 let file_name = path
545 .file_name()
546 .unwrap_or_default()
547 .to_string_lossy()
548 .to_string();
549 sliding_window_chunks(source, path, &file_name, 1, chunk_config)
550}
551
/// Splits `content` into sliding windows named `<name>[<index>]`.
///
/// `base_line` is the 1-based line at which `content` starts; it is passed
/// through to `sliding_window_chunks`, which this function forwards to with
/// its arguments reordered.
fn sliding_windows_with_name(
    path: &Path,
    content: &str,
    name: &str,
    base_line: usize,
    chunk_config: &ChunkConfig,
) -> Vec<CodeChunk> {
    sliding_window_chunks(content, path, name, base_line, chunk_config)
}
565
566fn sliding_window_chunks(
573 source: &str,
574 file_path: &Path,
575 name_prefix: &str,
576 base_line: usize,
577 chunk_config: &ChunkConfig,
578) -> Vec<CodeChunk> {
579 let step = chunk_config
580 .window_size
581 .saturating_sub(chunk_config.window_overlap)
582 .max(1);
583 let bytes = source.as_bytes();
584 let mut chunks = Vec::new();
585 let mut offset = 0;
586 let mut window_idx = 0;
587
588 while offset < bytes.len() {
589 let raw_end = (offset + chunk_config.window_size).min(bytes.len());
590
591 let end = if raw_end < bytes.len() {
593 match bytes[offset..raw_end].iter().rposition(|&b| b == b'\n') {
594 Some(pos) => offset + pos + 1,
595 None => raw_end, }
597 } else {
598 raw_end
599 };
600
601 if let Ok(window) = std::str::from_utf8(&bytes[offset..end])
603 && !window.trim().is_empty()
604 {
605 let start_line = base_line + source[..offset].matches('\n').count();
606 let content_lines = window.lines().count().max(1);
607 let end_line = start_line + content_lines - 1;
608 let content = window.to_string();
609 chunks.push(CodeChunk {
610 file_path: file_path.display().to_string(),
611 name: format!("{name_prefix}[{window_idx}]"),
612 kind: "window".to_string(),
613 start_line,
614 end_line,
615 enriched_content: content.clone(),
616 content,
617 });
618 window_idx += 1;
619 }
620
621 offset += step;
622 }
623
624 chunks
625}
626
/// Unit tests for the chunking entry points and their helpers.
#[cfg(test)]
mod tests {
    use super::*;
    use std::fmt::Write as _;
    use std::path::Path;

    // Rust definitions are found by the query and named after their identifier.
    #[test]
    fn chunks_rust_functions_and_structs() {
        let source = "fn hello() { println!(\"hi\"); }\nfn world() {}\nstruct Foo { x: i32 }";
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("test.rs"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {}",
            chunks.len()
        );
        assert!(chunks.iter().any(|c| c.name == "hello"));
        assert!(chunks.iter().any(|c| c.name == "world"));
    }

    // Python functions and classes each produce a named chunk.
    #[test]
    fn chunks_python_functions_and_classes() {
        let source = "def greet(name):\n pass\n\nclass Foo:\n pass\n";
        let config = crate::languages::config_for_extension("py").unwrap();
        let chunks = chunk_file(
            Path::new("test.py"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.name == "greet"));
        assert!(chunks.iter().any(|c| c.name == "Foo"));
    }

    // The `pyi` extension resolves to a configuration that finds stub
    // definitions as well.
    #[test]
    fn chunks_python_stub_functions_and_classes() {
        let source = "from typing import Protocol\n\ndef greet(name: str) -> str: ...\n\nclass Foo(Protocol):\n value: int\n";
        let config = crate::languages::config_for_extension("pyi").unwrap();
        let chunks = chunk_file(
            Path::new("test.pyi"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.name == "greet"));
        assert!(chunks.iter().any(|c| c.name == "Foo"));
    }

    // A small file without any query match falls back to a single "file" chunk.
    #[test]
    fn fallback_small_file_single_chunk() {
        let source = "// just a comment\n// and another\n";
        let config = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(
            Path::new("script.js"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].kind, "file");
    }

    // A large file without any query match falls back to sliding windows.
    #[test]
    fn fallback_large_file_produces_windows() {
        let line = "console.log('hello world, this is a long line of javascript code');\n";
        // 200 repetitions exceed the default `max_chunk_bytes` (asserted below).
        let source: String = line.repeat(200);
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(Path::new("big.js"), &source, &config, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected multiple windows, got {}",
            chunks.len()
        );
        assert!(chunks.iter().all(|c| c.kind == "window"));
        assert!(chunks[0].name.contains("[0]"));
    }

    // A definition larger than `max_chunk_bytes` is split into windows that
    // keep the definition's name as prefix.
    #[test]
    fn large_definition_is_windowed() {
        let mut source = String::from("fn big_function() {\n");
        for i in 0..200 {
            writeln!(source, " let var_{i} = {i} * 2 + 1; // some computation").unwrap();
        }
        source.push_str("}\n");
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("test.rs"), &source, &config, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected windowed chunks, got {}",
            chunks.len()
        );
        assert!(chunks[0].name.starts_with("big_function["));
    }

    // An empty input yields no chunks at all.
    #[test]
    fn empty_file_produces_no_chunks() {
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("empty.rs"), "", &config, &ChunkConfig::default());
        assert!(chunks.is_empty());
    }

    // Parses `source` with the language registered for `ext` and returns the
    // tree together with the language configuration. Despite the name, the
    // helper does not search for a node; callers run the query on the
    // returned tree themselves.
    fn first_def_node(
        source: &str,
        ext: &str,
    ) -> (
        tree_sitter::Tree,
        std::sync::Arc<crate::languages::LangConfig>,
    ) {
        let config = crate::languages::config_for_extension(ext).unwrap();
        let mut parser = Parser::new();
        parser.set_language(&config.language).unwrap();
        let tree = parser.parse(source, None).unwrap();
        (tree, config)
    }

    // A method inside a Rust `impl` block reports the impl in its scope chain.
    #[test]
    fn scope_chain_rust_impl_method() {
        let source = "impl Foo {\n fn bar(&self) {}\n}";
        let (tree, config) = first_def_node(source, "rs");
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        // Keeps the last `def` capture produced by the query.
        let mut def_node = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" {
                    def_node = Some(cap.node);
                }
            }
        }
        let node = def_node.expect("should find a @def node");
        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("impl_item"),
            "scope should contain impl_item, got: {scope}"
        );
        assert!(
            scope.contains("Foo"),
            "scope should contain 'Foo', got: {scope}"
        );
    }

    // A method inside a Python class reports the class in its scope chain.
    #[test]
    fn scope_chain_python_class_method() {
        let source = "class Greeter:\n def say_hello(self):\n pass\n";
        let (tree, config) = first_def_node(source, "py");
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        // Only function definitions are of interest here; the class itself
        // may also be captured as `def`.
        let mut fn_node = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" && cap.node.kind() == "function_definition" {
                    fn_node = Some(cap.node);
                }
            }
        }
        let node = fn_node.expect("should find say_hello @def node");
        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("class_definition"),
            "scope should contain class_definition, got: {scope}"
        );
        assert!(
            scope.contains("Greeter"),
            "scope should contain 'Greeter', got: {scope}"
        );
    }

    // The signature of a Rust function contains name, parameters and return type.
    #[test]
    fn extract_signature_rust_function() {
        let source = "fn greet(name: &str) -> String { name.to_string() }";
        let (tree, config) = first_def_node(source, "rs");
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());

        let mut def_node = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &config.query.capture_names()[cap.index as usize];
                if *cap_name == "def" {
                    def_node = Some(cap.node);
                }
            }
        }
        let node = def_node.expect("should find @def node");
        let sig = extract_signature(node, source).expect("should extract signature");
        assert!(
            sig.contains("greet"),
            "signature should contain 'greet', got: {sig}"
        );
        assert!(
            sig.contains("name: &str"),
            "signature should contain parameter, got: {sig}"
        );
        assert!(
            sig.contains("-> String"),
            "signature should contain return type, got: {sig}"
        );
    }

    // Syntax-derived chunks get a `//` header with the file path in
    // `enriched_content`, while `content` stays raw.
    #[test]
    fn enriched_content_has_header() {
        let source = "fn hello() { println!(\"hi\"); }";
        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("src/main.rs"),
            source,
            &config,
            &ChunkConfig::default(),
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            chunk.enriched_content.starts_with("//"),
            "enriched_content should start with '//' header, got: {}",
            &chunk.enriched_content[..chunk.enriched_content.len().min(80)]
        );
        assert!(
            chunk.enriched_content.contains("src/main.rs"),
            "enriched_content should contain file path"
        );
        assert!(
            !chunk.content.starts_with("//"),
            "raw content should not start with header"
        );
    }

    // Plain-text chunks carry no header: both text fields are equal.
    #[test]
    fn sliding_window_enriched_equals_content() {
        let source = "let x = 42;\nconsole.log(x);\n";
        let chunks = chunk_text(Path::new("test.txt"), source, &ChunkConfig::default());
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert_eq!(
                chunk.enriched_content, chunk.content,
                "sliding window chunks should have enriched_content == content"
            );
        }
    }

    // RDF/XML and OWL files are chunked per XML element through `chunk_file`.
    #[test]
    fn chunks_rdf_xml_and_owl_elements_with_tree_sitter() {
        let source = r#"<?xml version="1.0"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
 xmlns:owl="http://www.w3.org/2002/07/owl#">
 <owl:Class rdf:about="http://example.com/Person"/>
 <owl:ObjectProperty rdf:about="http://example.com/knows"/>
</rdf:RDF>"#;
        let rdf_config = crate::languages::config_for_extension("rdf").unwrap();
        let owl_config = crate::languages::config_for_extension("owl").unwrap();

        let rdf_chunks = chunk_file(
            Path::new("ontology.rdf"),
            source,
            &rdf_config,
            &ChunkConfig::default(),
        );
        let owl_chunks = chunk_file(
            Path::new("ontology.owl"),
            source,
            &owl_config,
            &ChunkConfig::default(),
        );

        assert!(rdf_chunks.iter().any(|chunk| chunk.name == "owl:Class"));
        assert!(
            rdf_chunks
                .iter()
                .any(|chunk| chunk.name == "owl:ObjectProperty")
        );
        assert!(rdf_chunks.iter().all(|chunk| chunk.kind == "element"));
        assert!(owl_chunks.iter().any(|chunk| chunk.name == "owl:Class"));
    }

    // Turtle input is grouped into one directive block plus one block per
    // statement, each named after its first token.
    #[test]
    fn chunks_turtle_by_rdf_statement_blocks() {
        let source = r#"@prefix ex: <http://example.com/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

ex:Person
 a owl:Class ;
 ex:label "Person" .

ex:knows
 a owl:ObjectProperty ;
 ex:domain ex:Person ;
 ex:range ex:Person .
"#;

        let chunks = chunk_rdf_text(Path::new("ontology.ttl"), source, &ChunkConfig::default());

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].kind, "rdf_statements");
        assert_eq!(chunks[0].name, "@prefix");
        assert_eq!(chunks[1].name, "ex:Person");
        assert_eq!(chunks[2].name, "ex:knows");
    }

    // When header plus content would exceed `max_chunk_bytes`, the header is
    // omitted from `enriched_content`.
    #[test]
    fn header_dropped_when_exceeding_max_bytes() {
        let tiny_config = ChunkConfig {
            max_chunk_bytes: 60,
            window_size: 30,
            window_overlap: 10,
        };
        let source = "fn f() { let x = 42; return x; }";
        // The definition itself still fits, so it is not windowed.
        assert!(source.len() <= tiny_config.max_chunk_bytes);

        let config = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("long/path/to/file.rs"),
            source,
            &config,
            &tiny_config,
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            !chunk.enriched_content.starts_with("//"),
            "header should be dropped when it would exceed max_chunk_bytes"
        );
        assert_eq!(chunk.content, source, "raw content should be unchanged");
    }

    // Covers the three normalisations of `minify_whitespace`: indentation,
    // trailing whitespace, and runs of blank lines.
    #[test]
    fn minify_whitespace_normalizes_indent_and_strips_trailing() {
        let source = "fn foo() {\n let x = 1;\n let y = 2;\n}\n";
        let result = minify_whitespace(source);
        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(
            lines[1], " let x = 1;",
            "8-space indent should become 4-space"
        );
        assert_eq!(
            lines[2], " let y = 2;",
            "8-space indent should become 4-space"
        );

        let with_trailing = "fn bar() \n return 1; \n";
        let result2 = minify_whitespace(with_trailing);
        assert!(
            result2.lines().all(|l| !l.ends_with(' ')),
            "trailing whitespace should be stripped"
        );

        let with_blanks = "a\n\n\n\nb\n";
        let result3 = minify_whitespace(with_blanks);
        // Lengths of the runs of empty lines that are followed by a
        // non-empty line.
        let blank_runs: Vec<usize> = {
            let mut runs = Vec::new();
            let mut count = 0usize;
            for line in result3.lines() {
                if line.is_empty() {
                    count += 1;
                } else {
                    if count > 0 {
                        runs.push(count);
                    }
                    count = 0;
                }
            }
            runs
        };
        assert!(
            blank_runs.iter().all(|&n| n <= 1),
            "3+ blank lines should collapse to 1, got runs: {blank_runs:?}"
        );
    }
}