Skip to main content

ripvec_core/
chunk.rs

1//! Tree-sitter based code chunking with sliding-window fallback.
2//!
3//! Parses source files into ASTs and extracts semantic chunks at
4//! function, class, and method boundaries. For files without recognized
5//! semantic structure (or very large fallback chunks), splits into
6//! overlapping sliding windows for uniform embedding sizes.
7
8use std::path::Path;
9use streaming_iterator::StreamingIterator;
10use tree_sitter::{Parser, QueryCursor};
11
/// Tunable size limits for the chunking pipeline.
///
/// Every field is a byte count. A value of this type is handed to
/// [`chunk_file`] (and the other chunking entry points) so chunk sizing can
/// be adjusted at runtime rather than at compile time.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Upper bound, in bytes, on the content of a single chunk.
    ///
    /// Anything longer is broken into sliding windows, even when tree-sitter
    /// reported it as one definition (for example a 500-line function).
    pub max_chunk_bytes: usize,
    /// Desired size, in bytes, of each sliding window.
    ///
    /// Roughly 2KB of source text is about 128-256 tokens after BPE, which
    /// stays well inside the 512-token model limit and suits embedding
    /// quality.
    pub window_size: usize,
    /// Number of bytes shared by two neighbouring windows.
    ///
    /// The overlap lets a definition that straddles a window edge appear
    /// whole in at least one window. The default is 25% of `window_size`.
    pub window_overlap: usize,
}
31
32impl Default for ChunkConfig {
33    fn default() -> Self {
34        Self {
35            max_chunk_bytes: 4096,
36            window_size: 2048,
37            window_overlap: 512,
38        }
39    }
40}
41
42/// A semantic chunk extracted from a source file.
43#[derive(
44    Debug,
45    Clone,
46    rkyv::Archive,
47    rkyv::Serialize,
48    rkyv::Deserialize,
49    bitcode::Encode,
50    bitcode::Decode,
51)]
52pub struct CodeChunk {
53    /// Path to the source file.
54    pub file_path: String,
55    /// Name of the definition (function name, class name, etc.).
56    pub name: String,
57    /// Kind of syntax node (e.g., `function_item`, `class_definition`).
58    pub kind: String,
59    /// 1-based start line number.
60    pub start_line: usize,
61    /// 1-based end line number.
62    pub end_line: usize,
63    /// Source text of the chunk (raw code for display).
64    pub content: String,
65    /// Enriched content with scope chain and signature metadata for embedding.
66    /// Falls back to `content` if metadata would exceed chunk size limits.
67    pub enriched_content: String,
68}
69
70/// Walk up the AST parent chain collecting structural container names.
71///
72/// Produces a scope chain like `"impl_item Foo > fn forward"` by
73/// identifying structural containers (impl blocks, classes, modules, namespaces)
74/// and extracting their names. Tries the `name` field first, then `type`
75/// (for Rust `impl_item` which uses `type` instead of `name`).
76#[must_use]
77pub fn build_scope_chain(node: tree_sitter::Node<'_>, source: &str) -> String {
78    /// Node kinds that represent structural containers, by language.
79    const CONTAINER_KINDS: &[&str] = &[
80        // Rust
81        "impl_item",
82        "trait_item",
83        "mod_item",
84        // Python
85        "class_definition",
86        "module",
87        // JS/TS
88        "class_declaration",
89        // Java
90        // "class_declaration" already covered above
91        // Go
92        "type_declaration",
93        // C++
94        "namespace_definition",
95        "class_specifier",
96    ];
97
98    /// Field names to try when extracting the container name.
99    /// `impl_item` uses `type` instead of `name`; Go `type_declaration`
100    /// has no fields, so we fall back to the node kind.
101    const NAME_FIELDS: &[&str] = &["name", "type"];
102
103    let mut parts = Vec::new();
104    let mut current = node.parent();
105    while let Some(parent) = current {
106        let kind = parent.kind();
107        if CONTAINER_KINDS.contains(&kind) {
108            let name = NAME_FIELDS
109                .iter()
110                .find_map(|field| parent.child_by_field_name(field))
111                .map_or(kind, |n| &source[n.start_byte()..n.end_byte()]);
112            parts.push(format!("{kind} {name}"));
113        }
114        current = parent.parent();
115    }
116    parts.reverse();
117    parts.join(" > ")
118}
119
120/// Extract the function/method signature from a definition node.
121///
122/// Returns the text from the function name to the start of the body,
123/// which captures the parameter list and return type (if any).
124/// Returns `None` if the node has no `name` or `body`/`block` field.
125#[must_use]
126pub fn extract_signature(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
127    let name_node = node.child_by_field_name("name")?;
128    let body_node = node
129        .child_by_field_name("body")
130        .or_else(|| node.child_by_field_name("block"))?;
131    let start = name_node.start_byte();
132    let end = body_node.start_byte();
133    if start >= end {
134        return None;
135    }
136    let sig = source[start..end].trim();
137    if sig.is_empty() {
138        None
139    } else {
140        Some(sig.to_string())
141    }
142}
143
/// Shrink whitespace so less of the embedding window is spent on it.
///
/// Line by line:
/// - Leading spaces and tabs are measured in columns (a space is 1 column,
///   a tab is 2) and the indent is halved, rounding up: 4 spaces → 2,
///   8 spaces → 4, 3 spaces → 2, 1 tab → 1 space.
/// - Trailing whitespace is removed.
/// - A whitespace-only line counts as blank; any run of consecutive blank
///   lines is reduced to a single empty line.
///
/// The output ends with a newline only if `source` did.
#[must_use]
pub fn minify_whitespace(source: &str) -> String {
    let mut out = String::with_capacity(source.len());
    let mut previous_was_blank = false;

    for raw in source.lines() {
        let body = raw.trim();

        if body.is_empty() {
            // Keep only the first blank line of a run.
            if !previous_was_blank {
                out.push('\n');
            }
            previous_was_blank = true;
            continue;
        }
        previous_was_blank = false;

        // Indent width in columns: a tab counts double.
        let columns: usize = raw
            .bytes()
            .take_while(|b| matches!(*b, b' ' | b'\t'))
            .map(|b| if b == b'\t' { 2 } else { 1 })
            .sum();

        // Two columns of original indent become one space (rounded up).
        out.extend(std::iter::repeat(' ').take(columns.div_ceil(2)));
        out.push_str(body);
        out.push('\n');
    }

    // Every emitted line got a newline; drop the last one if the input had none.
    if !source.ends_with('\n') && out.ends_with('\n') {
        out.pop();
    }

    out
}
194
195/// Build the enriched content header for a code chunk.
196///
197/// Prepends scope chain and signature metadata as a comment line.
198/// If the header + content would exceed `max_bytes`, returns `content` unchanged.
199fn build_enriched_content(
200    path: &Path,
201    node: tree_sitter::Node<'_>,
202    source: &str,
203    content: &str,
204    max_bytes: usize,
205) -> String {
206    let scope = build_scope_chain(node, source);
207    let sig = extract_signature(node, source).unwrap_or_default();
208    let rel_path = path.display().to_string();
209
210    let header = if scope.is_empty() && sig.is_empty() {
211        format!("// {rel_path}\n")
212    } else if scope.is_empty() {
213        format!("// {rel_path} | defines: {sig}\n")
214    } else if sig.is_empty() {
215        format!("// {rel_path} | {scope}\n")
216    } else {
217        format!("// {rel_path} | {scope} | defines: {sig}\n")
218    };
219
220    // Minify whitespace for the embedding content to reduce token waste.
221    // The raw `content` field is kept as-is for display.
222    let minified = minify_whitespace(content);
223
224    if header.len() + minified.len() > max_bytes {
225        minified
226    } else {
227        format!("{header}{minified}")
228    }
229}
230
231/// Extract semantic chunks from a source file.
232///
233/// Uses tree-sitter to parse the file and extract definitions matching
234/// the language's query patterns. For files with no semantic matches,
235/// falls back to overlapping sliding windows. Large individual chunks
236/// are also split into windows.
237///
238/// Pass a [`ChunkConfig`] to control chunk sizing at runtime.
239#[must_use]
240pub fn chunk_file(
241    path: &Path,
242    source: &str,
243    config: &crate::languages::LangConfig,
244    chunk_config: &ChunkConfig,
245) -> Vec<CodeChunk> {
246    let mut parser = Parser::new();
247    if parser.set_language(&config.language).is_err() {
248        return sliding_windows(path, source, chunk_config);
249    }
250
251    let Some(tree) = parser.parse(source, None) else {
252        return sliding_windows(path, source, chunk_config);
253    };
254
255    let mut cursor = QueryCursor::new();
256    let mut chunks = Vec::new();
257    let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
258
259    while let Some(m) = matches.next() {
260        let mut name = String::new();
261        let mut def_node = None;
262        for cap in m.captures {
263            let cap_name = &config.query.capture_names()[cap.index as usize];
264            if *cap_name == "name" {
265                name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
266            } else if *cap_name == "def" {
267                def_node = Some(cap.node);
268            }
269        }
270        if let Some(node) = def_node {
271            let content = &source[node.start_byte()..node.end_byte()];
272            let start_line = node.start_position().row + 1;
273
274            // Split oversized chunks into windows
275            if content.len() > chunk_config.max_chunk_bytes {
276                chunks.extend(sliding_windows_with_name(
277                    path,
278                    content,
279                    &name,
280                    start_line,
281                    chunk_config,
282                ));
283            } else {
284                let enriched = build_enriched_content(
285                    path,
286                    node,
287                    source,
288                    content,
289                    chunk_config.max_chunk_bytes,
290                );
291                chunks.push(CodeChunk {
292                    file_path: path.display().to_string(),
293                    name,
294                    kind: node.kind().to_string(),
295                    start_line,
296                    end_line: node.end_position().row + 1,
297                    enriched_content: enriched,
298                    content: content.to_string(),
299                });
300            }
301        }
302    }
303
304    // Fallback: sliding windows if no semantic matches
305    if chunks.is_empty() && !source.trim().is_empty() {
306        return sliding_windows(path, source, chunk_config);
307    }
308
309    chunks
310}
311
312/// Split source text into overlapping sliding windows.
313///
314/// Each window is `chunk_config.window_size` bytes with `chunk_config.window_overlap` bytes of
315/// overlap. Window boundaries are adjusted to line breaks to avoid
316/// splitting mid-line.
317///
318/// This is used as the fallback for files without tree-sitter support
319/// (plain text, unknown extensions) and for large semantic chunks that
320/// exceed `max_chunk_bytes`.
321#[must_use]
322pub fn chunk_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
323    sliding_windows(path, source, chunk_config)
324}
325
/// Report whether `ext` names an RDF-family text format that lacks a stable
/// Rust tree-sitter grammar.
///
/// Recognised extensions are `ttl`, `nt`, `n3`, `trig` and `nq`, compared
/// without regard to ASCII case. `ext` is expected without a leading dot.
#[must_use]
pub fn is_rdf_text_extension(ext: &str) -> bool {
    const RDF_TEXT_EXTENSIONS: [&str; 5] = ["ttl", "nt", "n3", "trig", "nq"];

    RDF_TEXT_EXTENSIONS
        .iter()
        .any(|known| ext.eq_ignore_ascii_case(known))
}
334
335/// Chunk Turtle/N-Triples/TriG/N-Quads style RDF by statement blocks.
336///
337/// RDF text formats are denser than prose but often lack a mature packaged
338/// tree-sitter grammar. This keeps prefixes together and groups multi-line
339/// subject statements ending in `.` so ontology classes and predicates remain
340/// intact for embedding.
341#[must_use]
342pub fn chunk_rdf_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
343    if source.trim().is_empty() {
344        return vec![];
345    }
346
347    let mut chunks = Vec::new();
348    let mut current = String::new();
349    let mut current_start_line = 1usize;
350    let mut current_is_directive = false;
351
352    for (line_idx, line) in source.lines().enumerate() {
353        let line_no = line_idx + 1;
354        let trimmed = line.trim();
355        if trimmed.is_empty() {
356            flush_rdf_block(
357                path,
358                &current,
359                current_start_line,
360                chunk_config,
361                &mut chunks,
362            );
363            current.clear();
364            current_is_directive = false;
365            continue;
366        }
367
368        let line_is_directive = is_rdf_directive(trimmed);
369        if !current.is_empty() && current_is_directive && !line_is_directive {
370            flush_rdf_block(
371                path,
372                &current,
373                current_start_line,
374                chunk_config,
375                &mut chunks,
376            );
377            current.clear();
378            current_is_directive = false;
379        }
380
381        if current.is_empty() {
382            current_start_line = line_no;
383            current_is_directive = line_is_directive;
384        }
385        current.push_str(line);
386        current.push('\n');
387
388        if !current_is_directive && trimmed.ends_with('.') {
389            flush_rdf_block(
390                path,
391                &current,
392                current_start_line,
393                chunk_config,
394                &mut chunks,
395            );
396            current.clear();
397            current_is_directive = false;
398        }
399    }
400
401    flush_rdf_block(
402        path,
403        &current,
404        current_start_line,
405        chunk_config,
406        &mut chunks,
407    );
408    if chunks.is_empty() {
409        sliding_windows(path, source, chunk_config)
410    } else {
411        chunks
412    }
413}
414
415/// Chunk a source file according to its path extension.
416#[must_use]
417pub fn chunk_source_for_path(
418    path: &Path,
419    source: &str,
420    text_mode: bool,
421    chunk_config: &ChunkConfig,
422) -> Vec<CodeChunk> {
423    if text_mode {
424        return chunk_text(path, source, chunk_config);
425    }
426
427    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
428    if let Some(lang_config) = crate::languages::config_for_extension(ext) {
429        chunk_file(path, source, &lang_config, chunk_config)
430    } else if is_rdf_text_extension(ext) {
431        chunk_rdf_text(path, source, chunk_config)
432    } else {
433        chunk_text(path, source, chunk_config)
434    }
435}
436
/// Report whether an already-trimmed line starts with an RDF directive
/// keyword: `@prefix`, `@base`, `PREFIX` or `BASE`.
///
/// The check is a case-sensitive prefix match on the keyword only.
fn is_rdf_directive(trimmed: &str) -> bool {
    const DIRECTIVE_KEYWORDS: [&str; 4] = ["@prefix", "@base", "PREFIX", "BASE"];

    DIRECTIVE_KEYWORDS
        .iter()
        .any(|&keyword| trimmed.starts_with(keyword))
}
443
444fn flush_rdf_block(
445    path: &Path,
446    content: &str,
447    start_line: usize,
448    chunk_config: &ChunkConfig,
449    chunks: &mut Vec<CodeChunk>,
450) {
451    let trimmed = content.trim();
452    if trimmed.is_empty() {
453        return;
454    }
455    let name = rdf_block_name(trimmed, path);
456    let content = format!("{trimmed}\n");
457    if content.len() > chunk_config.max_chunk_bytes {
458        chunks.extend(sliding_window_chunks(
459            &content,
460            path,
461            &name,
462            start_line,
463            chunk_config,
464        ));
465        return;
466    }
467    let header = format!("# {} | rdf: {name}\n", path.display());
468    let enriched_content = if header.len() + content.len() <= chunk_config.max_chunk_bytes {
469        format!("{header}{content}")
470    } else {
471        content.clone()
472    };
473    let line_count = content.lines().count().max(1);
474    chunks.push(CodeChunk {
475        file_path: path.display().to_string(),
476        name,
477        kind: "rdf_statements".to_string(),
478        start_line,
479        end_line: start_line + line_count - 1,
480        enriched_content,
481        content,
482    });
483}
484
/// Derive a short name for an RDF block.
///
/// The name comes from the first line that is neither empty nor a `#`
/// comment:
/// - a line starting with `@prefix` or `PREFIX` gives `"@prefix"`;
/// - a line starting with `@base` or `BASE` gives `"@base"`;
/// - any other line gives its first whitespace-separated token with
///   trailing `;`, `,` and `.` characters removed.
///
/// When there is no such line, or the token is empty after stripping, the
/// file name of `path` is used (an empty string if the path has none).
fn rdf_block_name(content: &str, path: &Path) -> String {
    let file_name = || {
        path.file_name()
            .unwrap_or_default()
            .to_string_lossy()
            .into_owned()
    };

    let significant = content
        .lines()
        .map(str::trim)
        .find(|line| !(line.is_empty() || line.starts_with('#')));
    let Some(head) = significant else {
        return file_name();
    };

    // Directive blocks are named after the directive, whichever spelling was used.
    let directives = [("@prefix", ["@prefix", "PREFIX"]), ("@base", ["@base", "BASE"])];
    for (label, spellings) in directives {
        if spellings.iter().any(|&keyword| head.starts_with(keyword)) {
            return label.to_string();
        }
    }

    // Statement blocks are named after their subject token.
    let subject = head
        .split_whitespace()
        .next()
        .unwrap_or_default()
        .trim_end_matches([';', ',', '.']);
    if subject.is_empty() {
        file_name()
    } else {
        subject.to_string()
    }
}
519
520/// Internal sliding-window implementation.
521fn sliding_windows(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
522    if source.trim().is_empty() {
523        return vec![];
524    }
525
526    // Small enough for a single chunk
527    if source.len() <= chunk_config.max_chunk_bytes {
528        let content = source.to_string();
529        return vec![CodeChunk {
530            file_path: path.display().to_string(),
531            name: path
532                .file_name()
533                .unwrap_or_default()
534                .to_string_lossy()
535                .to_string(),
536            kind: "file".to_string(),
537            start_line: 1,
538            end_line: source.lines().count(),
539            enriched_content: content.clone(),
540            content,
541        }];
542    }
543
544    let file_name = path
545        .file_name()
546        .unwrap_or_default()
547        .to_string_lossy()
548        .to_string();
549    sliding_window_chunks(source, path, &file_name, 1, chunk_config)
550}
551
552/// Split a named definition into overlapping windows.
553///
554/// Used when a single tree-sitter match (e.g., a large function) exceeds
555/// `chunk_config.max_chunk_bytes`. Windows carry the definition name for search context.
556fn sliding_windows_with_name(
557    path: &Path,
558    content: &str,
559    name: &str,
560    base_line: usize,
561    chunk_config: &ChunkConfig,
562) -> Vec<CodeChunk> {
563    sliding_window_chunks(content, path, name, base_line, chunk_config)
564}
565
566/// Shared sliding-window loop used by both [`sliding_windows`] and
567/// [`sliding_windows_with_name`].
568///
569/// Splits `source` into overlapping windows of `chunk_config.window_size` bytes,
570/// snapping boundaries to line breaks. Each chunk is tagged with `name_prefix`
571/// and an index suffix (e.g., `"main[0]"`, `"main[1]"`).
572fn sliding_window_chunks(
573    source: &str,
574    file_path: &Path,
575    name_prefix: &str,
576    base_line: usize,
577    chunk_config: &ChunkConfig,
578) -> Vec<CodeChunk> {
579    let step = chunk_config
580        .window_size
581        .saturating_sub(chunk_config.window_overlap)
582        .max(1);
583    let bytes = source.as_bytes();
584    let mut chunks = Vec::new();
585    let mut offset = 0;
586    let mut window_idx = 0;
587
588    while offset < bytes.len() {
589        let raw_end = (offset + chunk_config.window_size).min(bytes.len());
590
591        // Snap end to a line boundary (don't split mid-line)
592        let end = if raw_end < bytes.len() {
593            match bytes[offset..raw_end].iter().rposition(|&b| b == b'\n') {
594                Some(pos) => offset + pos + 1,
595                None => raw_end, // no newline found, use raw end
596            }
597        } else {
598            raw_end
599        };
600
601        // Extract window as str (skip invalid UTF-8)
602        if let Ok(window) = std::str::from_utf8(&bytes[offset..end])
603            && !window.trim().is_empty()
604        {
605            let start_line = base_line + source[..offset].matches('\n').count();
606            let content_lines = window.lines().count().max(1);
607            let end_line = start_line + content_lines - 1;
608            let content = window.to_string();
609            chunks.push(CodeChunk {
610                file_path: file_path.display().to_string(),
611                name: format!("{name_prefix}[{window_idx}]"),
612                kind: "window".to_string(),
613                start_line,
614                end_line,
615                enriched_content: content.clone(),
616                content,
617            });
618            window_idx += 1;
619        }
620
621        offset += step;
622    }
623
624    chunks
625}
626
627#[cfg(test)]
628mod tests {
629    use super::*;
630    use std::fmt::Write as _;
631    use std::path::Path;
632
633    #[test]
634    fn chunks_rust_functions_and_structs() {
635        let source = "fn hello() { println!(\"hi\"); }\nfn world() {}\nstruct Foo { x: i32 }";
636        let config = crate::languages::config_for_extension("rs").unwrap();
637        let chunks = chunk_file(
638            Path::new("test.rs"),
639            source,
640            &config,
641            &ChunkConfig::default(),
642        );
643        assert!(
644            chunks.len() >= 2,
645            "expected at least 2 chunks, got {}",
646            chunks.len()
647        );
648        assert!(chunks.iter().any(|c| c.name == "hello"));
649        assert!(chunks.iter().any(|c| c.name == "world"));
650    }
651
652    #[test]
653    fn chunks_python_functions_and_classes() {
654        let source = "def greet(name):\n    pass\n\nclass Foo:\n    pass\n";
655        let config = crate::languages::config_for_extension("py").unwrap();
656        let chunks = chunk_file(
657            Path::new("test.py"),
658            source,
659            &config,
660            &ChunkConfig::default(),
661        );
662        assert!(chunks.len() >= 2);
663        assert!(chunks.iter().any(|c| c.name == "greet"));
664        assert!(chunks.iter().any(|c| c.name == "Foo"));
665    }
666
667    #[test]
668    fn chunks_python_stub_functions_and_classes() {
669        let source = "from typing import Protocol\n\ndef greet(name: str) -> str: ...\n\nclass Foo(Protocol):\n    value: int\n";
670        let config = crate::languages::config_for_extension("pyi").unwrap();
671        let chunks = chunk_file(
672            Path::new("test.pyi"),
673            source,
674            &config,
675            &ChunkConfig::default(),
676        );
677        assert!(chunks.len() >= 2);
678        assert!(chunks.iter().any(|c| c.name == "greet"));
679        assert!(chunks.iter().any(|c| c.name == "Foo"));
680    }
681
682    #[test]
683    fn fallback_small_file_single_chunk() {
684        // With enriched queries, `let x = 42` matches variable_declarator.
685        // Use a source with NO tree-sitter captures to test the plaintext fallback.
686        let source = "// just a comment\n// and another\n";
687        let config = crate::languages::config_for_extension("js").unwrap();
688        let chunks = chunk_file(
689            Path::new("script.js"),
690            source,
691            &config,
692            &ChunkConfig::default(),
693        );
694        assert_eq!(chunks.len(), 1);
695        assert_eq!(chunks[0].kind, "file");
696    }
697
698    #[test]
699    fn fallback_large_file_produces_windows() {
700        // Create a file larger than default max_chunk_bytes with no function declarations
701        let line = "console.log('hello world, this is a long line of javascript code');\n";
702        let source: String = line.repeat(200); // ~13KB
703        let chunk_config = ChunkConfig::default();
704        assert!(source.len() > chunk_config.max_chunk_bytes);
705
706        let config = crate::languages::config_for_extension("js").unwrap();
707        let chunks = chunk_file(Path::new("big.js"), &source, &config, &chunk_config);
708        assert!(
709            chunks.len() > 1,
710            "expected multiple windows, got {}",
711            chunks.len()
712        );
713        assert!(chunks.iter().all(|c| c.kind == "window"));
714        assert!(chunks[0].name.contains("[0]"));
715    }
716
717    #[test]
718    fn large_definition_is_windowed() {
719        // A Rust function larger than default max_chunk_bytes
720        let mut source = String::from("fn big_function() {\n");
721        for i in 0..200 {
722            writeln!(source, "    let var_{i} = {i} * 2 + 1; // some computation").unwrap();
723        }
724        source.push_str("}\n");
725        let chunk_config = ChunkConfig::default();
726        assert!(source.len() > chunk_config.max_chunk_bytes);
727
728        let config = crate::languages::config_for_extension("rs").unwrap();
729        let chunks = chunk_file(Path::new("test.rs"), &source, &config, &chunk_config);
730        assert!(
731            chunks.len() > 1,
732            "expected windowed chunks, got {}",
733            chunks.len()
734        );
735        assert!(chunks[0].name.starts_with("big_function["));
736    }
737
738    #[test]
739    fn empty_file_produces_no_chunks() {
740        let config = crate::languages::config_for_extension("rs").unwrap();
741        let chunks = chunk_file(Path::new("empty.rs"), "", &config, &ChunkConfig::default());
742        assert!(chunks.is_empty());
743    }
744
745    // --- T1 enrichment tests ---
746
747    /// Helper: parse source with tree-sitter and return the first `@def` node.
748    fn first_def_node(
749        source: &str,
750        ext: &str,
751    ) -> (
752        tree_sitter::Tree,
753        std::sync::Arc<crate::languages::LangConfig>,
754    ) {
755        let config = crate::languages::config_for_extension(ext).unwrap();
756        let mut parser = Parser::new();
757        parser.set_language(&config.language).unwrap();
758        let tree = parser.parse(source, None).unwrap();
759        (tree, config)
760    }
761
762    #[test]
763    fn scope_chain_rust_impl_method() {
764        let source = "impl Foo {\n    fn bar(&self) {}\n}";
765        let (tree, config) = first_def_node(source, "rs");
766        let mut cursor = QueryCursor::new();
767        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
768
769        let mut def_node = None;
770        while let Some(m) = StreamingIterator::next(&mut matches) {
771            for cap in m.captures {
772                let cap_name = &config.query.capture_names()[cap.index as usize];
773                if *cap_name == "def" {
774                    def_node = Some(cap.node);
775                }
776            }
777        }
778        let node = def_node.expect("should find a @def node");
779        let scope = build_scope_chain(node, source);
780        assert!(
781            scope.contains("impl_item"),
782            "scope should contain impl_item, got: {scope}"
783        );
784        assert!(
785            scope.contains("Foo"),
786            "scope should contain 'Foo', got: {scope}"
787        );
788    }
789
790    #[test]
791    fn scope_chain_python_class_method() {
792        let source = "class Greeter:\n    def say_hello(self):\n        pass\n";
793        let (tree, config) = first_def_node(source, "py");
794        let mut cursor = QueryCursor::new();
795        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
796
797        // Find the function_definition @def (say_hello), not the class @def
798        let mut fn_node = None;
799        while let Some(m) = StreamingIterator::next(&mut matches) {
800            for cap in m.captures {
801                let cap_name = &config.query.capture_names()[cap.index as usize];
802                if *cap_name == "def" && cap.node.kind() == "function_definition" {
803                    fn_node = Some(cap.node);
804                }
805            }
806        }
807        let node = fn_node.expect("should find say_hello @def node");
808        let scope = build_scope_chain(node, source);
809        assert!(
810            scope.contains("class_definition"),
811            "scope should contain class_definition, got: {scope}"
812        );
813        assert!(
814            scope.contains("Greeter"),
815            "scope should contain 'Greeter', got: {scope}"
816        );
817    }
818
819    #[test]
820    fn extract_signature_rust_function() {
821        let source = "fn greet(name: &str) -> String { name.to_string() }";
822        let (tree, config) = first_def_node(source, "rs");
823        let mut cursor = QueryCursor::new();
824        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
825
826        let mut def_node = None;
827        while let Some(m) = StreamingIterator::next(&mut matches) {
828            for cap in m.captures {
829                let cap_name = &config.query.capture_names()[cap.index as usize];
830                if *cap_name == "def" {
831                    def_node = Some(cap.node);
832                }
833            }
834        }
835        let node = def_node.expect("should find @def node");
836        let sig = extract_signature(node, source).expect("should extract signature");
837        assert!(
838            sig.contains("greet"),
839            "signature should contain 'greet', got: {sig}"
840        );
841        assert!(
842            sig.contains("name: &str"),
843            "signature should contain parameter, got: {sig}"
844        );
845        assert!(
846            sig.contains("-> String"),
847            "signature should contain return type, got: {sig}"
848        );
849    }
850
851    #[test]
852    fn enriched_content_has_header() {
853        let source = "fn hello() { println!(\"hi\"); }";
854        let config = crate::languages::config_for_extension("rs").unwrap();
855        let chunks = chunk_file(
856            Path::new("src/main.rs"),
857            source,
858            &config,
859            &ChunkConfig::default(),
860        );
861        assert!(!chunks.is_empty());
862        let chunk = &chunks[0];
863        assert!(
864            chunk.enriched_content.starts_with("//"),
865            "enriched_content should start with '//' header, got: {}",
866            &chunk.enriched_content[..chunk.enriched_content.len().min(80)]
867        );
868        assert!(
869            chunk.enriched_content.contains("src/main.rs"),
870            "enriched_content should contain file path"
871        );
872        // Raw content should NOT have the header
873        assert!(
874            !chunk.content.starts_with("//"),
875            "raw content should not start with header"
876        );
877    }
878
879    #[test]
880    fn sliding_window_enriched_equals_content() {
881        let source = "let x = 42;\nconsole.log(x);\n";
882        let chunks = chunk_text(Path::new("test.txt"), source, &ChunkConfig::default());
883        assert!(!chunks.is_empty());
884        for chunk in &chunks {
885            assert_eq!(
886                chunk.enriched_content, chunk.content,
887                "sliding window chunks should have enriched_content == content"
888            );
889        }
890    }
891
892    #[test]
893    fn chunks_rdf_xml_and_owl_elements_with_tree_sitter() {
894        let source = r#"<?xml version="1.0"?>
895<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
896         xmlns:owl="http://www.w3.org/2002/07/owl#">
897  <owl:Class rdf:about="http://example.com/Person"/>
898  <owl:ObjectProperty rdf:about="http://example.com/knows"/>
899</rdf:RDF>"#;
900        let rdf_config = crate::languages::config_for_extension("rdf").unwrap();
901        let owl_config = crate::languages::config_for_extension("owl").unwrap();
902
903        let rdf_chunks = chunk_file(
904            Path::new("ontology.rdf"),
905            source,
906            &rdf_config,
907            &ChunkConfig::default(),
908        );
909        let owl_chunks = chunk_file(
910            Path::new("ontology.owl"),
911            source,
912            &owl_config,
913            &ChunkConfig::default(),
914        );
915
916        assert!(rdf_chunks.iter().any(|chunk| chunk.name == "owl:Class"));
917        assert!(
918            rdf_chunks
919                .iter()
920                .any(|chunk| chunk.name == "owl:ObjectProperty")
921        );
922        assert!(rdf_chunks.iter().all(|chunk| chunk.kind == "element"));
923        assert!(owl_chunks.iter().any(|chunk| chunk.name == "owl:Class"));
924    }
925
926    #[test]
927    fn chunks_turtle_by_rdf_statement_blocks() {
928        let source = r#"@prefix ex: <http://example.com/> .
929@prefix owl: <http://www.w3.org/2002/07/owl#> .
930
931ex:Person
932  a owl:Class ;
933  ex:label "Person" .
934
935ex:knows
936  a owl:ObjectProperty ;
937  ex:domain ex:Person ;
938  ex:range ex:Person .
939"#;
940
941        let chunks = chunk_rdf_text(Path::new("ontology.ttl"), source, &ChunkConfig::default());
942
943        assert_eq!(chunks.len(), 3);
944        assert_eq!(chunks[0].kind, "rdf_statements");
945        assert_eq!(chunks[0].name, "@prefix");
946        assert_eq!(chunks[1].name, "ex:Person");
947        assert_eq!(chunks[2].name, "ex:knows");
948    }
949
950    #[test]
951    fn header_dropped_when_exceeding_max_bytes() {
952        // Create a chunk that barely fits in max_chunk_bytes, so adding
953        // a header would push it over the limit.
954        let tiny_config = ChunkConfig {
955            max_chunk_bytes: 60,
956            window_size: 30,
957            window_overlap: 10,
958        };
959        // Source is exactly at max_chunk_bytes — any header would exceed it
960        let source = "fn f() { let x = 42; return x; }";
961        assert!(source.len() <= tiny_config.max_chunk_bytes);
962
963        let config = crate::languages::config_for_extension("rs").unwrap();
964        let chunks = chunk_file(
965            Path::new("long/path/to/file.rs"),
966            source,
967            &config,
968            &tiny_config,
969        );
970        assert!(!chunks.is_empty());
971        let chunk = &chunks[0];
972        // Header ("// long/path/to/file.rs | defines: ...") + minified content > 60 bytes.
973        // So enriched_content should fall back to minified content (no header),
974        // and raw content is preserved as-is.
975        assert!(
976            !chunk.enriched_content.starts_with("//"),
977            "header should be dropped when it would exceed max_chunk_bytes"
978        );
979        assert_eq!(chunk.content, source, "raw content should be unchanged");
980    }
981
982    #[test]
983    fn minify_whitespace_normalizes_indent_and_strips_trailing() {
984        // 8-space indent → 4-space (halved)
985        let source = "fn foo() {\n        let x = 1;\n        let y = 2;\n}\n";
986        let result = minify_whitespace(source);
987        let lines: Vec<&str> = result.lines().collect();
988        assert_eq!(
989            lines[1], "    let x = 1;",
990            "8-space indent should become 4-space"
991        );
992        assert_eq!(
993            lines[2], "    let y = 2;",
994            "8-space indent should become 4-space"
995        );
996
997        // Trailing whitespace removed
998        let with_trailing = "fn bar()   \n    return 1;   \n";
999        let result2 = minify_whitespace(with_trailing);
1000        assert!(
1001            result2.lines().all(|l| !l.ends_with(' ')),
1002            "trailing whitespace should be stripped"
1003        );
1004
1005        // 3+ consecutive blank lines collapsed to 1
1006        let with_blanks = "a\n\n\n\nb\n";
1007        let result3 = minify_whitespace(with_blanks);
1008        // Should have at most 1 blank line between a and b
1009        let blank_runs: Vec<usize> = {
1010            let mut runs = Vec::new();
1011            let mut count = 0usize;
1012            for line in result3.lines() {
1013                if line.is_empty() {
1014                    count += 1;
1015                } else {
1016                    if count > 0 {
1017                        runs.push(count);
1018                    }
1019                    count = 0;
1020                }
1021            }
1022            runs
1023        };
1024        assert!(
1025            blank_runs.iter().all(|&n| n <= 1),
1026            "3+ blank lines should collapse to 1, got runs: {blank_runs:?}"
1027        );
1028    }
1029}