Skip to main content

ripvec_core/
chunk.rs

1//! Tree-sitter based code chunking with sliding-window fallback.
2//!
3//! Parses source files into ASTs and extracts semantic chunks at
4//! function, class, and method boundaries. For files without recognized
5//! semantic structure (or very large fallback chunks), splits into
6//! overlapping sliding windows for uniform embedding sizes.
7
8use std::path::Path;
9use streaming_iterator::StreamingIterator;
10use tree_sitter::{Parser, QueryCursor};
11
/// High-level content category of a chunked file.
///
/// Assigned at chunk-emission time based on the source file's extension
/// (see [`ContentKind::from_extension`]). Used by downstream tools
/// (`get_repo_map`, `find_duplicates`, `search`) to filter corpus slices
/// without re-examining every chunk's content.
///
/// Serialized via `rkyv` and `bitcode` alongside [`CodeChunk`] so the
/// value survives the in-memory index round-trip without re-derivation.
// NOTE(review): variant order presumably participates in the serialized
// encoding — append new variants at the end rather than reordering; confirm
// against the rkyv/bitcode docs before changing.
///
/// The default is [`ContentKind::Code`] so that test fixtures and
/// pre-existing construction sites that don't yet specify a kind continue
/// to behave as before (code-centric queries include them by default).
#[derive(
    Debug,
    Clone,
    Copy,
    Default,
    PartialEq,
    Eq,
    Hash,
    rkyv::Archive,
    rkyv::Serialize,
    rkyv::Deserialize,
    bitcode::Encode,
    bitcode::Decode,
)]
pub enum ContentKind {
    /// Source code files (`.rs`, `.py`, `.ts`, `.go`, etc.), and any
    /// extension that is not explicitly classified as docs or meta.
    ///
    /// This is the default variant so that `..Default::default()` in struct
    /// update expressions produces `ContentKind::Code`.
    #[default]
    Code,
    /// Human-readable documentation (`.md`, `.rst`, `.txt`, `.adoc`,
    /// `.asciidoc`, `.org`).
    Docs,
    /// Machine-generated or structured data files (`.json`, `.yaml`, `.yml`,
    /// `.toml`, `.xml`, `.lock`, `.snap`, `.csv`, `.tsv`, `.proto`, `.rdf`,
    /// `.owl`, `.tfvars`).
    Meta,
}
51
52impl ContentKind {
53    /// Classify a file extension (without leading dot, case-insensitive) into
54    /// a [`ContentKind`].
55    ///
56    /// Unknown extensions default to [`ContentKind::Code`] so that novel
57    /// source files are included in code search paths rather than silently
58    /// filtered.
59    #[must_use]
60    pub fn from_extension(ext: &str) -> Self {
61        match ext.to_ascii_lowercase().as_str() {
62            // -- Docs --
63            "md" | "rst" | "txt" | "adoc" | "asciidoc" | "org" => Self::Docs,
64            // -- Meta (structured data / generated files) --
65            // HCL: .tfvars carries environment variable values (per-env
66            // settings), not application code. Classifying as Meta makes
67            // find_duplicates(include_metadata=false) drop the noisy
68            // F1 Environment Fan-Out clusters by default (R4, Wave 3).
69            "json" | "yaml" | "yml" | "toml" | "xml" | "lock" | "snap" | "csv" | "tsv"
70            | "proto" | "rdf" | "owl" | "tfvars" => Self::Meta,
71            // -- Code (all recognised source extensions, plus unknown = code) --
72            _ => Self::Code,
73        }
74    }
75}
76
/// Tunable size limits for the chunking pipeline.
///
/// Every field is a byte count. Hand a value of this type to
/// [`chunk_file`] to adjust chunk sizing at runtime instead of at
/// compile time.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Upper bound on a chunk's content length, in bytes. Anything larger
    /// is split into windows, even when tree-sitter reported it as one
    /// definition (e.g., a 500-line function).
    pub max_chunk_bytes: usize,
    /// Target size, in bytes, of each window produced by the sliding-window
    /// fallback chunker. ~2KB of source text ≈ 128-256 tokens after BPE,
    /// well within the 512-token model limit and optimal for embedding
    /// quality.
    pub window_size: usize,
    /// Number of bytes shared by two adjacent windows, so that a definition
    /// straddling a window boundary is fully contained in at least one
    /// window. The default is 25% of `window_size`.
    pub window_overlap: usize,
}

impl Default for ChunkConfig {
    /// Defaults: 2048-byte windows, 512 bytes of overlap, 4096-byte chunk cap.
    fn default() -> Self {
        /// Default window size; the other defaults are expressed relative to it.
        const DEFAULT_WINDOW: usize = 2 * 1024;

        Self {
            window_size: DEFAULT_WINDOW,
            window_overlap: DEFAULT_WINDOW / 4,
            max_chunk_bytes: DEFAULT_WINDOW * 2,
        }
    }
}
106
/// A semantic chunk extracted from a source file.
///
/// Produced by every chunker in this module (tree-sitter query matches,
/// HCL locals expansion, RDF statement blocks, and sliding windows).
/// Serialized via `rkyv` and `bitcode`.
// NOTE(review): field order presumably participates in the serialized
// layout — confirm before reordering or inserting fields.
#[derive(
    Debug,
    Clone,
    rkyv::Archive,
    rkyv::Serialize,
    rkyv::Deserialize,
    bitcode::Encode,
    bitcode::Decode,
)]
pub struct CodeChunk {
    /// Path to the source file.
    pub file_path: String,
    /// Name of the definition (function name, class name, etc.).
    pub name: String,
    /// Kind of syntax node (e.g., `function_item`, `class_definition`).
    /// For chunks emitted from the `dense.rs` semble path, this is the
    /// LSP SymbolKind numeric value serialized as a string (e.g., `"12"` for
    /// Function). For chunks emitted from `chunk.rs`'s query-based path, this
    /// is the raw tree-sitter node kind string. Downstream consumers should
    /// call [`crate::languages::lsp_symbol_kind_for_node_kind`] to normalize.
    ///
    /// This module also emits the synthetic kinds `"file"` (whole-file
    /// chunk), `"window"` (sliding window), `"rdf_statements"` (RDF
    /// statement block) and `"local_attribute"` (HCL local).
    pub kind: String,
    /// High-level content category derived from the file extension.
    ///
    /// Set by the chunker at emission time. Downstream filters (e.g.,
    /// `find_duplicates`, `get_repo_map`) use this to exclude
    /// [`ContentKind::Meta`] files from code-centric queries by default.
    pub content_kind: ContentKind,
    /// 1-based start line number (first line of the chunk, which may be a
    /// doc-comment line preceding the symbol's identifier).
    pub start_line: usize,
    /// 1-based end line number.
    pub end_line: usize,
    /// 1-based line number of the symbol's name-identifier token within this
    /// chunk. For tree-sitter query-based chunks (`chunk_file`), this is the
    /// exact line where the `@name` capture appears, so LSP tools that jump to
    /// `lsp_location.start_line` land on the identifier rather than on a
    /// preceding doc-comment line.
    ///
    /// Defaults to `start_line` for chunks produced by the sliding-window
    /// fallback, the semble/dense path, and RDF-block paths, where the
    /// name-identifier line is not separately tracked.
    ///
    /// Convention matches `start_line`: 1-based, relative to the file's first
    /// line.
    pub symbol_line: usize,
    /// Source text of the chunk (raw code for display).
    pub content: String,
    /// Enriched content with scope chain and signature metadata for embedding.
    /// Falls back to un-prefixed content if the metadata header would push
    /// the chunk past the configured size limit.
    pub enriched_content: String,
    /// Optional language-specific composite identifier that disambiguates a
    /// symbol whose bare `name` is shared across many declarations.
    ///
    /// For HCL: `resource "aws_iam_role" "loader" { ... }` ships
    /// `name = "loader"` and `qualified_name = Some("aws_iam_role.loader")`.
    /// Each `local.X = ...` extracted as its own chunk ships
    /// `qualified_name = Some("local.X")`. Workspace symbol consumers
    /// surface `qualified_name` when set so a query for the composite form
    /// finds the right resource on corpora that reuse names like `loader`
    /// across many resource types (R1, R6, Wave 3).
    ///
    /// `None` for all non-HCL chunks today; reserved for future
    /// language-specific composite naming.
    pub qualified_name: Option<String>,
}
173
174/// Walk up the AST parent chain collecting structural container names.
175///
176/// Produces a scope chain like `"impl_item Foo > fn forward"` by
177/// identifying structural containers (impl blocks, classes, modules, namespaces)
178/// and extracting their names. Tries the `name` field first, then `type`
179/// (for Rust `impl_item` which uses `type` instead of `name`).
180#[must_use]
181pub fn build_scope_chain(node: tree_sitter::Node<'_>, source: &str) -> String {
182    /// Node kinds that represent structural containers, by language.
183    const CONTAINER_KINDS: &[&str] = &[
184        // Rust
185        "impl_item",
186        "trait_item",
187        "mod_item",
188        // Python
189        "class_definition",
190        "module",
191        // JS/TS
192        "class_declaration",
193        // Java
194        // "class_declaration" already covered above
195        // Go
196        "type_declaration",
197        // C++
198        "namespace_definition",
199        "class_specifier",
200    ];
201
202    /// Field names to try when extracting the container name.
203    /// `impl_item` uses `type` instead of `name`; Go `type_declaration`
204    /// has no fields, so we fall back to the node kind.
205    const NAME_FIELDS: &[&str] = &["name", "type"];
206
207    let mut parts = Vec::new();
208    let mut current = node.parent();
209    while let Some(parent) = current {
210        let kind = parent.kind();
211        if CONTAINER_KINDS.contains(&kind) {
212            let name = NAME_FIELDS
213                .iter()
214                .find_map(|field| parent.child_by_field_name(field))
215                .map_or(kind, |n| &source[n.start_byte()..n.end_byte()]);
216            parts.push(format!("{kind} {name}"));
217        }
218        current = parent.parent();
219    }
220    parts.reverse();
221    parts.join(" > ")
222}
223
224/// Extract the function/method signature from a definition node.
225///
226/// Returns the text from the function name to the start of the body,
227/// which captures the parameter list and return type (if any).
228/// Returns `None` if the node has no `name` or `body`/`block` field.
229#[must_use]
230pub fn extract_signature(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
231    let name_node = node.child_by_field_name("name")?;
232    let body_node = node
233        .child_by_field_name("body")
234        .or_else(|| node.child_by_field_name("block"))?;
235    let start = name_node.start_byte();
236    let end = body_node.start_byte();
237    if start >= end {
238        return None;
239    }
240    let sig = source[start..end].trim();
241    if sig.is_empty() {
242        None
243    } else {
244        Some(sig.to_string())
245    }
246}
247
/// Shrink whitespace so source text spends fewer tokens in the embedding
/// window.
///
/// Line by line:
/// - Leading spaces and tabs are measured (a space counts 1, a tab counts 2)
///   and replaced by half that many spaces, rounded up
///   (4 spaces → 2, 8 spaces → 4, 3 spaces → 2, 1 tab → 1).
/// - Trailing whitespace is removed.
/// - A line containing only whitespace is a blank line; every run of
///   consecutive blank lines is reduced to a single empty line.
///
/// Each emitted line is terminated with `\n`. If `source` does not end with
/// `\n`, one trailing `\n` is removed from the result.
#[must_use]
pub fn minify_whitespace(source: &str) -> String {
    let mut out = String::with_capacity(source.len());
    let mut in_blank_run = false;

    for raw in source.lines() {
        let text = raw.trim_start();

        if text.is_empty() {
            // Only the first blank line of a run is emitted.
            if !in_blank_run {
                out.push('\n');
            }
            in_blank_run = true;
            continue;
        }
        in_blank_run = false;

        // Width of the leading space/tab prefix, with a tab counted as 2.
        let width: usize = raw
            .chars()
            .map_while(|c| match c {
                ' ' => Some(1usize),
                '\t' => Some(2),
                _ => None,
            })
            .sum();

        // Halve the indent, rounding up so a 1-wide indent survives.
        out.extend(std::iter::repeat(' ').take(width.div_ceil(2)));
        out.push_str(text.trim_end());
        out.push('\n');
    }

    // Don't invent a final newline the input did not have.
    if out.ends_with('\n') && !source.ends_with('\n') {
        out.pop();
    }

    out
}
298
299/// Build the enriched content header for a code chunk.
300///
301/// Prepends scope chain and signature metadata as a comment line.
302/// If the header + content would exceed `max_bytes`, returns `content` unchanged.
303fn build_enriched_content(
304    path: &Path,
305    node: tree_sitter::Node<'_>,
306    source: &str,
307    content: &str,
308    max_bytes: usize,
309) -> String {
310    let scope = build_scope_chain(node, source);
311    let sig = extract_signature(node, source).unwrap_or_default();
312    let rel_path = path.display().to_string();
313
314    let header = if scope.is_empty() && sig.is_empty() {
315        format!("// {rel_path}\n")
316    } else if scope.is_empty() {
317        format!("// {rel_path} | defines: {sig}\n")
318    } else if sig.is_empty() {
319        format!("// {rel_path} | {scope}\n")
320    } else {
321        format!("// {rel_path} | {scope} | defines: {sig}\n")
322    };
323
324    // Minify whitespace for the embedding content to reduce token waste.
325    // The raw `content` field is kept as-is for display.
326    let minified = minify_whitespace(content);
327
328    if header.len() + minified.len() > max_bytes {
329        minified
330    } else {
331        format!("{header}{minified}")
332    }
333}
334
335/// Extract semantic chunks from a source file.
336///
337/// Uses tree-sitter to parse the file and extract definitions matching
338/// the language's query patterns. For files with no semantic matches,
339/// falls back to overlapping sliding windows. Large individual chunks
340/// are also split into windows.
341///
342/// Pass a [`ChunkConfig`] to control chunk sizing at runtime.
343#[must_use]
344pub fn chunk_file(
345    path: &Path,
346    source: &str,
347    config: &crate::languages::LangConfig,
348    chunk_config: &ChunkConfig,
349) -> Vec<CodeChunk> {
350    let mut parser = Parser::new();
351    if parser.set_language(&config.language).is_err() {
352        return sliding_windows(path, source, chunk_config);
353    }
354
355    let Some(tree) = parser.parse(source, None) else {
356        return sliding_windows(path, source, chunk_config);
357    };
358
359    let mut cursor = QueryCursor::new();
360    let mut chunks = Vec::new();
361    let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
362
363    while let Some(m) = matches.next() {
364        let mut name = String::new();
365        // 1-based line of the `@name` capture node (identifier line).
366        // Set when the `@name` capture is found; falls back to def-node start_line.
367        let mut name_line: Option<usize> = None;
368        let mut def_node = None;
369        for cap in m.captures {
370            let cap_name = &config.query.capture_names()[cap.index as usize];
371            if *cap_name == "name" {
372                name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
373                // tree-sitter row is 0-based; convert to 1-based to match start_line.
374                name_line = Some(cap.node.start_position().row + 1);
375            } else if *cap_name == "def" {
376                def_node = Some(cap.node);
377            }
378        }
379        if let Some(node) = def_node {
380            let content = &source[node.start_byte()..node.end_byte()];
381            let start_line = node.start_position().row + 1;
382            // `symbol_line` is the identifier's line; falls back to `start_line`
383            // when the `@name` capture was not found (should not happen for
384            // well-formed queries, but defended here for robustness).
385            let symbol_line = name_line.unwrap_or(start_line);
386
387            // HCL: extract `type.name` composite for `block` nodes via the
388            // shared derive_hcl_block_name helper, and expand `locals { ... }`
389            // blocks into one chunk per attribute (R1 + R6, Wave 3).
390            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
391            let is_hcl = matches!(ext.to_ascii_lowercase().as_str(), "tf" | "tfvars" | "hcl");
392
393            // C13W1: the HCL query now also captures `attribute` nodes inside
394            // `locals { ... }` blocks (so the dense AST-merge pipeline can name
395            // them). In `chunk_file`, the per-locals post-processing below
396            // (`emit_hcl_local_attribute_chunks`) is the canonical emission
397            // path — it ships `kind="local_attribute"` + `qualified_name="local.X"`.
398            // Skip the raw `attribute` @def match here to avoid double-emitting
399            // a parallel chunk with `kind="attribute"`.
400            if is_hcl && node.kind() == "attribute" {
401                continue;
402            }
403
404            let qualified_name = if is_hcl && node.kind() == "block" {
405                let composite = crate::languages::derive_hcl_block_name(&node, source.as_bytes());
406                if composite.is_empty() || composite == name {
407                    None
408                } else {
409                    Some(composite)
410                }
411            } else {
412                None
413            };
414
415            // R6: expand `locals { ... }` into per-attribute chunks. This
416            // produces additional chunks (one per local) but also keeps the
417            // umbrella `locals` chunk so consumers that want the block as
418            // a whole still see it.
419            if is_hcl && node.kind() == "block" && name == "locals" {
420                chunks.extend(emit_hcl_local_attribute_chunks(
421                    path,
422                    source,
423                    node,
424                    chunk_config,
425                ));
426            }
427
428            // Split oversized chunks into windows
429            if content.len() > chunk_config.max_chunk_bytes {
430                chunks.extend(sliding_windows_with_name(
431                    path,
432                    content,
433                    &name,
434                    start_line,
435                    chunk_config,
436                ));
437            } else {
438                let enriched = build_enriched_content(
439                    path,
440                    node,
441                    source,
442                    content,
443                    chunk_config.max_chunk_bytes,
444                );
445                chunks.push(CodeChunk {
446                    file_path: path.display().to_string(),
447                    name,
448                    kind: node.kind().to_string(),
449                    content_kind: ContentKind::from_extension(ext),
450                    start_line,
451                    end_line: node.end_position().row + 1,
452                    symbol_line,
453                    enriched_content: enriched,
454                    content: content.to_string(),
455                    qualified_name,
456                });
457            }
458        }
459    }
460
461    // Fallback: sliding windows if no semantic matches
462    if chunks.is_empty() && !source.trim().is_empty() {
463        return sliding_windows(path, source, chunk_config);
464    }
465
466    chunks
467}
468
469/// Walk a HCL `locals { ... }` block and emit one [`CodeChunk`] per
470/// `attribute` (`local.X = value`).
471///
472/// Each emitted chunk has `kind = "local_attribute"` (which maps to
473/// LSP `SymbolKind::Constant` (14) via
474/// [`crate::languages::lsp_symbol_kind_for_node_kind`]), `name = X`,
475/// `qualified_name = Some("local.X")`, and `content_kind` derived from
476/// the file extension. T18 Cohesion Refraction queries can now distinguish
477/// individual locals (R6, Wave 3).
478///
479/// Only called from [`chunk_file`] when the chunker sees a `block` node
480/// whose `@name` capture is `"locals"`. Non-attribute body children
481/// (whitespace, comments) are skipped.
482fn emit_hcl_local_attribute_chunks(
483    path: &Path,
484    source: &str,
485    locals_block: tree_sitter::Node<'_>,
486    chunk_config: &ChunkConfig,
487) -> Vec<CodeChunk> {
488    let mut out = Vec::new();
489    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
490    let content_kind = ContentKind::from_extension(ext);
491
492    // Walk children: identifier, block_start, body, block_end. The body
493    // contains the `attribute` children we care about.
494    let mut cursor = locals_block.walk();
495    for child in locals_block.children(&mut cursor) {
496        if child.kind() != "body" {
497            continue;
498        }
499        let mut body_cursor = child.walk();
500        for attr in child.children(&mut body_cursor) {
501            if attr.kind() != "attribute" {
502                continue;
503            }
504            // Find the identifier (local name) and stop at the first.
505            let mut name_node: Option<tree_sitter::Node<'_>> = None;
506            let mut attr_cursor = attr.walk();
507            for grandchild in attr.children(&mut attr_cursor) {
508                if grandchild.kind() == "identifier" {
509                    name_node = Some(grandchild);
510                    break;
511                }
512            }
513            let Some(id_node) = name_node else {
514                continue;
515            };
516            let name_text = source[id_node.start_byte()..id_node.end_byte()].to_string();
517            let content_text = source[attr.start_byte()..attr.end_byte()].to_string();
518            let start_line = attr.start_position().row + 1;
519            let end_line = attr.end_position().row + 1;
520            let symbol_line = id_node.start_position().row + 1;
521            let composite = format!("local.{name_text}");
522            let header = format!("// {} | local: {composite}\n", path.display());
523            let enriched = if header.len() + content_text.len() <= chunk_config.max_chunk_bytes {
524                format!("{header}{content_text}")
525            } else {
526                content_text.clone()
527            };
528            out.push(CodeChunk {
529                file_path: path.display().to_string(),
530                name: name_text,
531                kind: "local_attribute".to_string(),
532                content_kind,
533                start_line,
534                end_line,
535                symbol_line,
536                enriched_content: enriched,
537                content: content_text,
538                qualified_name: Some(composite),
539            });
540        }
541    }
542    out
543}
544
/// Split source text into overlapping sliding windows.
///
/// Each window is `chunk_config.window_size` bytes with `chunk_config.window_overlap` bytes of
/// overlap. Window boundaries are adjusted to line breaks to avoid
/// splitting mid-line.
///
/// This is used as the fallback for files without tree-sitter support
/// (plain text, unknown extensions) and for large semantic chunks that
/// exceed `max_chunk_bytes`.
///
/// Public entry point that delegates directly to the internal
/// `sliding_windows` helper: blank input yields no chunks, and input no
/// larger than `chunk_config.max_chunk_bytes` is returned as a single
/// `"file"` chunk rather than being windowed.
#[must_use]
pub fn chunk_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
    sliding_windows(path, source, chunk_config)
}
558
/// Report whether `ext` names an RDF-family text format that has no stable
/// Rust tree-sitter grammar.
///
/// `ext` is given without the leading dot; the comparison is
/// ASCII-case-insensitive. Recognised: `ttl`, `nt`, `n3`, `trig`, `nq`.
#[must_use]
pub fn is_rdf_text_extension(ext: &str) -> bool {
    const RDF_TEXT_EXTENSIONS: [&str; 5] = ["ttl", "nt", "n3", "trig", "nq"];

    RDF_TEXT_EXTENSIONS
        .iter()
        .any(|known| ext.eq_ignore_ascii_case(known))
}
567
568/// Chunk Turtle/N-Triples/TriG/N-Quads style RDF by statement blocks.
569///
570/// RDF text formats are denser than prose but often lack a mature packaged
571/// tree-sitter grammar. This keeps prefixes together and groups multi-line
572/// subject statements ending in `.` so ontology classes and predicates remain
573/// intact for embedding.
574#[must_use]
575pub fn chunk_rdf_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
576    if source.trim().is_empty() {
577        return vec![];
578    }
579
580    let mut chunks = Vec::new();
581    let mut current = String::new();
582    let mut current_start_line = 1usize;
583    let mut current_is_directive = false;
584
585    for (line_idx, line) in source.lines().enumerate() {
586        let line_no = line_idx + 1;
587        let trimmed = line.trim();
588        if trimmed.is_empty() {
589            flush_rdf_block(
590                path,
591                &current,
592                current_start_line,
593                chunk_config,
594                &mut chunks,
595            );
596            current.clear();
597            current_is_directive = false;
598            continue;
599        }
600
601        let line_is_directive = is_rdf_directive(trimmed);
602        if !current.is_empty() && current_is_directive && !line_is_directive {
603            flush_rdf_block(
604                path,
605                &current,
606                current_start_line,
607                chunk_config,
608                &mut chunks,
609            );
610            current.clear();
611            current_is_directive = false;
612        }
613
614        if current.is_empty() {
615            current_start_line = line_no;
616            current_is_directive = line_is_directive;
617        }
618        current.push_str(line);
619        current.push('\n');
620
621        if !current_is_directive && trimmed.ends_with('.') {
622            flush_rdf_block(
623                path,
624                &current,
625                current_start_line,
626                chunk_config,
627                &mut chunks,
628            );
629            current.clear();
630            current_is_directive = false;
631        }
632    }
633
634    flush_rdf_block(
635        path,
636        &current,
637        current_start_line,
638        chunk_config,
639        &mut chunks,
640    );
641    if chunks.is_empty() {
642        sliding_windows(path, source, chunk_config)
643    } else {
644        chunks
645    }
646}
647
648/// Chunk a source file according to its path extension.
649#[must_use]
650pub fn chunk_source_for_path(
651    path: &Path,
652    source: &str,
653    text_mode: bool,
654    chunk_config: &ChunkConfig,
655) -> Vec<CodeChunk> {
656    if text_mode {
657        return chunk_text(path, source, chunk_config);
658    }
659
660    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
661    if let Some(lang_config) = crate::languages::config_for_extension(ext) {
662        chunk_file(path, source, &lang_config, chunk_config)
663    } else if is_rdf_text_extension(ext) {
664        chunk_rdf_text(path, source, chunk_config)
665    } else {
666        chunk_text(path, source, chunk_config)
667    }
668}
669
/// Report whether an already-trimmed RDF line begins with a prefix/base
/// directive keyword (`@prefix`, `@base`, `PREFIX`, `BASE`).
///
/// The check is a case-sensitive prefix match on the line text.
fn is_rdf_directive(trimmed: &str) -> bool {
    const DIRECTIVE_KEYWORDS: [&str; 4] = ["@prefix", "@base", "PREFIX", "BASE"];

    DIRECTIVE_KEYWORDS
        .iter()
        .any(|keyword| trimmed.starts_with(*keyword))
}
676
677fn flush_rdf_block(
678    path: &Path,
679    content: &str,
680    start_line: usize,
681    chunk_config: &ChunkConfig,
682    chunks: &mut Vec<CodeChunk>,
683) {
684    let trimmed = content.trim();
685    if trimmed.is_empty() {
686        return;
687    }
688    let name = rdf_block_name(trimmed, path);
689    let content = format!("{trimmed}\n");
690    if content.len() > chunk_config.max_chunk_bytes {
691        chunks.extend(sliding_window_chunks(
692            &content,
693            path,
694            &name,
695            start_line,
696            chunk_config,
697        ));
698        return;
699    }
700    let header = format!("# {} | rdf: {name}\n", path.display());
701    let enriched_content = if header.len() + content.len() <= chunk_config.max_chunk_bytes {
702        format!("{header}{content}")
703    } else {
704        content.clone()
705    };
706    let line_count = content.lines().count().max(1);
707    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
708    chunks.push(CodeChunk {
709        file_path: path.display().to_string(),
710        name,
711        kind: "rdf_statements".to_string(),
712        content_kind: ContentKind::from_extension(ext),
713        start_line,
714        end_line: start_line + line_count - 1,
715        // RDF chunks have no name-identifier line distinct from the chunk start.
716        symbol_line: start_line,
717        enriched_content,
718        content,
719        qualified_name: None,
720    });
721}
722
/// Derive a display name for an RDF block.
///
/// Looks at the first line that is neither blank nor a `#` comment:
/// - a `@prefix`/`PREFIX` line names the block `"@prefix"`;
/// - a `@base`/`BASE` line names it `"@base"`;
/// - otherwise the name is the line's first whitespace-delimited token with
///   any trailing `;`, `,` or `.` characters removed.
///
/// When there is no such line, or the token is empty after stripping, the
/// file name of `path` is used instead.
fn rdf_block_name(content: &str, path: &Path) -> String {
    /// Directive keywords and the block name each pair maps to.
    const DIRECTIVE_NAMES: [(&str, [&str; 2]); 2] = [
        ("@prefix", ["@prefix", "PREFIX"]),
        ("@base", ["@base", "BASE"]),
    ];

    let file_name = || {
        path.file_name()
            .unwrap_or_default()
            .to_string_lossy()
            .to_string()
    };

    let mut meaningful = content
        .lines()
        .map(str::trim)
        .filter(|line| !(line.is_empty() || line.starts_with('#')));
    let Some(head) = meaningful.next() else {
        return file_name();
    };

    for (label, keywords) in DIRECTIVE_NAMES {
        if keywords.iter().any(|keyword| head.starts_with(*keyword)) {
            return label.to_string();
        }
    }

    // Subject token, minus statement punctuation glued to its end.
    let subject = head
        .split_whitespace()
        .next()
        .unwrap_or("")
        .trim_end_matches([';', ',', '.']);
    if subject.is_empty() {
        file_name()
    } else {
        subject.to_string()
    }
}
757
758/// Internal sliding-window implementation.
759fn sliding_windows(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
760    if source.trim().is_empty() {
761        return vec![];
762    }
763
764    // Small enough for a single chunk
765    if source.len() <= chunk_config.max_chunk_bytes {
766        let content = source.to_string();
767        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
768        return vec![CodeChunk {
769            file_path: path.display().to_string(),
770            name: path
771                .file_name()
772                .unwrap_or_default()
773                .to_string_lossy()
774                .to_string(),
775            kind: "file".to_string(),
776            content_kind: ContentKind::from_extension(ext),
777            start_line: 1,
778            end_line: source.lines().count(),
779            // Whole-file chunks have no distinct identifier line.
780            symbol_line: 1,
781            enriched_content: content.clone(),
782            content,
783            qualified_name: None,
784        }];
785    }
786
787    let file_name = path
788        .file_name()
789        .unwrap_or_default()
790        .to_string_lossy()
791        .to_string();
792    sliding_window_chunks(source, path, &file_name, 1, chunk_config)
793}
794
/// Split a named definition into overlapping windows.
///
/// Used when a single tree-sitter match (e.g., a large function) exceeds
/// `chunk_config.max_chunk_bytes`. Windows carry the definition name for search context.
///
/// Thin wrapper over `sliding_window_chunks` that only reorders the
/// arguments:
/// - `content` is the definition's text, which is windowed as-is (no
///   single-chunk shortcut is applied here).
/// - `name` becomes the prefix of each window's name (`"name[0]"`,
///   `"name[1]"`, ...).
/// - `base_line` is the 1-based file line on which `content` starts, so
///   window line numbers are reported relative to the file rather than to
///   the definition.
fn sliding_windows_with_name(
    path: &Path,
    content: &str,
    name: &str,
    base_line: usize,
    chunk_config: &ChunkConfig,
) -> Vec<CodeChunk> {
    sliding_window_chunks(content, path, name, base_line, chunk_config)
}
808
809/// Shared sliding-window loop used by both [`sliding_windows`] and
810/// [`sliding_windows_with_name`].
811///
812/// Splits `source` into overlapping windows of `chunk_config.window_size` bytes,
813/// snapping boundaries to line breaks. Each chunk is tagged with `name_prefix`
814/// and an index suffix (e.g., `"main[0]"`, `"main[1]"`).
815fn sliding_window_chunks(
816    source: &str,
817    file_path: &Path,
818    name_prefix: &str,
819    base_line: usize,
820    chunk_config: &ChunkConfig,
821) -> Vec<CodeChunk> {
822    let step = chunk_config
823        .window_size
824        .saturating_sub(chunk_config.window_overlap)
825        .max(1);
826    let bytes = source.as_bytes();
827    let mut chunks = Vec::new();
828    let mut offset = 0;
829    let mut window_idx = 0;
830
831    while offset < bytes.len() {
832        let raw_end = (offset + chunk_config.window_size).min(bytes.len());
833
834        // Snap end to a line boundary (don't split mid-line)
835        let end = if raw_end < bytes.len() {
836            match bytes[offset..raw_end].iter().rposition(|&b| b == b'\n') {
837                Some(pos) => offset + pos + 1,
838                None => raw_end, // no newline found, use raw end
839            }
840        } else {
841            raw_end
842        };
843
844        // Extract window as str (skip invalid UTF-8)
845        if let Ok(window) = std::str::from_utf8(&bytes[offset..end])
846            && !window.trim().is_empty()
847        {
848            let start_line = base_line + source[..offset].matches('\n').count();
849            let content_lines = window.lines().count().max(1);
850            let end_line = start_line + content_lines - 1;
851            let content = window.to_string();
852            let ext = file_path.extension().and_then(|e| e.to_str()).unwrap_or("");
853            chunks.push(CodeChunk {
854                file_path: file_path.display().to_string(),
855                name: format!("{name_prefix}[{window_idx}]"),
856                kind: "window".to_string(),
857                content_kind: ContentKind::from_extension(ext),
858                start_line,
859                end_line,
860                // Sliding-window chunks have no distinct identifier line.
861                symbol_line: start_line,
862                enriched_content: content.clone(),
863                content,
864                qualified_name: None,
865            });
866            window_idx += 1;
867        }
868
869        offset += step;
870    }
871
872    chunks
873}
874
#[cfg(test)]
mod tests {
    use super::*;
    use std::fmt::Write as _;
    use std::path::Path;

    // --- shared helpers ---

    /// Chunk `source` as though it were the file at `path`, using the language
    /// configuration registered for `ext` and the default [`ChunkConfig`].
    fn chunk_with_defaults(path: &str, ext: &str, source: &str) -> Vec<CodeChunk> {
        let lang = crate::languages::config_for_extension(ext).unwrap();
        chunk_file(Path::new(path), source, &lang, &ChunkConfig::default())
    }

    /// Collect the `content_kind` of every chunk, for assertion diagnostics.
    fn content_kinds(chunks: &[CodeChunk]) -> Vec<ContentKind> {
        chunks.iter().map(|c| c.content_kind).collect()
    }

    /// Parse `source` with the grammar registered for `ext`.
    ///
    /// Returns the syntax tree together with the language configuration whose
    /// query is used to look up `@def` captures.
    fn parse_with_grammar(
        source: &str,
        ext: &str,
    ) -> (
        tree_sitter::Tree,
        std::sync::Arc<crate::languages::LangConfig>,
    ) {
        let lang = crate::languages::config_for_extension(ext).unwrap();
        let mut parser = Parser::new();
        parser.set_language(&lang.language).unwrap();
        let tree = parser.parse(source, None).unwrap();
        (tree, lang)
    }

    /// Run the language query over `tree` and return the last `@def` capture,
    /// optionally restricted to nodes whose kind equals `wanted_kind`.
    fn last_def_node<'tree>(
        tree: &'tree tree_sitter::Tree,
        lang: &crate::languages::LangConfig,
        source: &str,
        wanted_kind: Option<&str>,
    ) -> Option<tree_sitter::Node<'tree>> {
        let mut cursor = QueryCursor::new();
        let mut matches = cursor.matches(&lang.query, tree.root_node(), source.as_bytes());

        let mut found = None;
        while let Some(m) = StreamingIterator::next(&mut matches) {
            for cap in m.captures {
                let cap_name = &lang.query.capture_names()[cap.index as usize];
                let kind_matches = wanted_kind.is_none_or(|k| cap.node.kind() == k);
                if *cap_name == "def" && kind_matches {
                    found = Some(cap.node);
                }
            }
        }
        found
    }

    // --- semantic chunking ---

    #[test]
    fn chunks_rust_functions_and_structs() {
        let chunks = chunk_with_defaults(
            "test.rs",
            "rs",
            "fn hello() { println!(\"hi\"); }\nfn world() {}\nstruct Foo { x: i32 }",
        );
        assert!(
            chunks.len() >= 2,
            "expected at least 2 chunks, got {}",
            chunks.len()
        );
        assert!(chunks.iter().any(|c| c.name == "hello"));
        assert!(chunks.iter().any(|c| c.name == "world"));
    }

    #[test]
    fn chunks_python_functions_and_classes() {
        let chunks = chunk_with_defaults(
            "test.py",
            "py",
            "def greet(name):\n    pass\n\nclass Foo:\n    pass\n",
        );
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.name == "greet"));
        assert!(chunks.iter().any(|c| c.name == "Foo"));
    }

    #[test]
    fn chunks_python_stub_functions_and_classes() {
        let chunks = chunk_with_defaults(
            "test.pyi",
            "pyi",
            "from typing import Protocol\n\ndef greet(name: str) -> str: ...\n\nclass Foo(Protocol):\n    value: int\n",
        );
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().any(|c| c.name == "greet"));
        assert!(chunks.iter().any(|c| c.name == "Foo"));
    }

    // --- plaintext fallback ---

    #[test]
    fn fallback_small_file_single_chunk() {
        // Comments only: with the enriched queries even `let x = 42` would be
        // captured, so the source must contain nothing tree-sitter matches in
        // order to exercise the plaintext fallback.
        let chunks = chunk_with_defaults(
            "script.js",
            "js",
            "// just a comment\n// and another\n",
        );
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].kind, "file");
    }

    #[test]
    fn fallback_large_file_produces_windows() {
        // ~13KB of statements with no function declarations, which must be
        // larger than the default max_chunk_bytes.
        let source =
            "console.log('hello world, this is a long line of javascript code');\n".repeat(200);
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let lang = crate::languages::config_for_extension("js").unwrap();
        let chunks = chunk_file(Path::new("big.js"), &source, &lang, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected multiple windows, got {}",
            chunks.len()
        );
        assert!(chunks.iter().all(|c| c.kind == "window"));
        assert!(chunks[0].name.contains("[0]"));
    }

    #[test]
    fn large_definition_is_windowed() {
        // One Rust function whose body alone exceeds the default
        // max_chunk_bytes.
        let mut source = String::from("fn big_function() {\n");
        (0..200).for_each(|i| {
            writeln!(source, "    let var_{i} = {i} * 2 + 1; // some computation").unwrap();
        });
        source.push_str("}\n");
        let chunk_config = ChunkConfig::default();
        assert!(source.len() > chunk_config.max_chunk_bytes);

        let lang = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(Path::new("test.rs"), &source, &lang, &chunk_config);
        assert!(
            chunks.len() > 1,
            "expected windowed chunks, got {}",
            chunks.len()
        );
        assert!(chunks[0].name.starts_with("big_function["));
    }

    #[test]
    fn empty_file_produces_no_chunks() {
        let chunks = chunk_with_defaults("empty.rs", "rs", "");
        assert!(chunks.is_empty());
    }

    // --- T1 enrichment tests ---

    #[test]
    fn scope_chain_rust_impl_method() {
        let source = "impl Foo {\n    fn bar(&self) {}\n}";
        let (tree, lang) = parse_with_grammar(source, "rs");
        let node = last_def_node(&tree, &lang, source, None).expect("should find a @def node");

        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("impl_item"),
            "scope should contain impl_item, got: {scope}"
        );
        assert!(
            scope.contains("Foo"),
            "scope should contain 'Foo', got: {scope}"
        );
    }

    #[test]
    fn scope_chain_python_class_method() {
        let source = "class Greeter:\n    def say_hello(self):\n        pass\n";
        let (tree, lang) = parse_with_grammar(source, "py");
        // Restrict to function definitions so the method's @def is picked,
        // not the enclosing class's.
        let node = last_def_node(&tree, &lang, source, Some("function_definition"))
            .expect("should find say_hello @def node");

        let scope = build_scope_chain(node, source);
        assert!(
            scope.contains("class_definition"),
            "scope should contain class_definition, got: {scope}"
        );
        assert!(
            scope.contains("Greeter"),
            "scope should contain 'Greeter', got: {scope}"
        );
    }

    #[test]
    fn extract_signature_rust_function() {
        let source = "fn greet(name: &str) -> String { name.to_string() }";
        let (tree, lang) = parse_with_grammar(source, "rs");
        let node = last_def_node(&tree, &lang, source, None).expect("should find @def node");

        let sig = extract_signature(node, source).expect("should extract signature");
        assert!(
            sig.contains("greet"),
            "signature should contain 'greet', got: {sig}"
        );
        assert!(
            sig.contains("name: &str"),
            "signature should contain parameter, got: {sig}"
        );
        assert!(
            sig.contains("-> String"),
            "signature should contain return type, got: {sig}"
        );
    }

    #[test]
    fn enriched_content_has_header() {
        let chunks = chunk_with_defaults("src/main.rs", "rs", "fn hello() { println!(\"hi\"); }");
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        assert!(
            chunk.enriched_content.starts_with("//"),
            "enriched_content should start with '//' header, got: {}",
            &chunk.enriched_content[..chunk.enriched_content.len().min(80)]
        );
        assert!(
            chunk.enriched_content.contains("src/main.rs"),
            "enriched_content should contain file path"
        );
        // The header belongs to the enriched form only, never the raw content.
        assert!(
            !chunk.content.starts_with("//"),
            "raw content should not start with header"
        );
    }

    #[test]
    fn sliding_window_enriched_equals_content() {
        let chunks = chunk_text(
            Path::new("test.txt"),
            "let x = 42;\nconsole.log(x);\n",
            &ChunkConfig::default(),
        );
        assert!(!chunks.is_empty());
        chunks.iter().for_each(|chunk| {
            assert_eq!(
                chunk.enriched_content, chunk.content,
                "sliding window chunks should have enriched_content == content"
            );
        });
    }

    // --- RDF / OWL / Turtle ---

    #[test]
    fn chunks_rdf_xml_and_owl_elements_with_tree_sitter() {
        let source = r#"<?xml version="1.0"?>
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:owl="http://www.w3.org/2002/07/owl#">
  <owl:Class rdf:about="http://example.com/Person"/>
  <owl:ObjectProperty rdf:about="http://example.com/knows"/>
</rdf:RDF>"#;

        // The same document is chunked under both extensions.
        let rdf_chunks = chunk_with_defaults("ontology.rdf", "rdf", source);
        let owl_chunks = chunk_with_defaults("ontology.owl", "owl", source);

        assert!(rdf_chunks.iter().any(|chunk| chunk.name == "owl:Class"));
        assert!(
            rdf_chunks
                .iter()
                .any(|chunk| chunk.name == "owl:ObjectProperty")
        );
        assert!(rdf_chunks.iter().all(|chunk| chunk.kind == "element"));
        assert!(owl_chunks.iter().any(|chunk| chunk.name == "owl:Class"));
    }

    #[test]
    fn chunks_turtle_by_rdf_statement_blocks() {
        let source = r#"@prefix ex: <http://example.com/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .

ex:Person
  a owl:Class ;
  ex:label "Person" .

ex:knows
  a owl:ObjectProperty ;
  ex:domain ex:Person ;
  ex:range ex:Person .
"#;

        let chunks = chunk_rdf_text(Path::new("ontology.ttl"), source, &ChunkConfig::default());

        // One chunk for the prefix declarations, then one per subject block.
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].kind, "rdf_statements");
        assert_eq!(chunks[0].name, "@prefix");
        assert_eq!(chunks[1].name, "ex:Person");
        assert_eq!(chunks[2].name, "ex:knows");
    }

    // --- header budget and whitespace minification ---

    #[test]
    fn header_dropped_when_exceeding_max_bytes() {
        // A deliberately tight budget: the source fits on its own, but there
        // is not enough room left for the enrichment header as well.
        let tiny_config = ChunkConfig {
            max_chunk_bytes: 60,
            window_size: 30,
            window_overlap: 10,
        };
        let source = "fn f() { let x = 42; return x; }";
        assert!(source.len() <= tiny_config.max_chunk_bytes);

        let lang = crate::languages::config_for_extension("rs").unwrap();
        let chunks = chunk_file(
            Path::new("long/path/to/file.rs"),
            source,
            &lang,
            &tiny_config,
        );
        assert!(!chunks.is_empty());
        let chunk = &chunks[0];
        // Header ("// long/path/to/file.rs | defines: ...") plus the minified
        // content is more than 60 bytes, so enriched_content must fall back to
        // the header-less form while the raw content stays untouched.
        assert!(
            !chunk.enriched_content.starts_with("//"),
            "header should be dropped when it would exceed max_chunk_bytes"
        );
        assert_eq!(chunk.content, source, "raw content should be unchanged");
    }

    #[test]
    fn minify_whitespace_normalizes_indent_and_strips_trailing() {
        // Indentation: 8 spaces are halved to 4.
        let result = minify_whitespace("fn foo() {\n        let x = 1;\n        let y = 2;\n}\n");
        let lines: Vec<&str> = result.lines().collect();
        assert_eq!(
            lines[1], "    let x = 1;",
            "8-space indent should become 4-space"
        );
        assert_eq!(
            lines[2], "    let y = 2;",
            "8-space indent should become 4-space"
        );

        // Trailing whitespace: removed from every line.
        let result2 = minify_whitespace("fn bar()   \n    return 1;   \n");
        assert!(
            result2.lines().all(|l| !l.ends_with(' ')),
            "trailing whitespace should be stripped"
        );

        // Blank lines: a run of 3+ collapses to a single blank line. Record
        // the length of every blank run that is followed by a non-blank line.
        let result3 = minify_whitespace("a\n\n\n\nb\n");
        let mut blank_runs: Vec<usize> = Vec::new();
        let mut current_run = 0usize;
        for line in result3.lines() {
            if line.is_empty() {
                current_run += 1;
                continue;
            }
            if current_run > 0 {
                blank_runs.push(current_run);
            }
            current_run = 0;
        }
        assert!(
            blank_runs.iter().all(|&n| n <= 1),
            "3+ blank lines should collapse to 1, got runs: {blank_runs:?}"
        );
    }

    // -------------------------------------------------------------------------
    // C1: ContentKind classification tests
    // -------------------------------------------------------------------------

    /// Every chunk produced from a `.rs` file is tagged `ContentKind::Code`.
    ///
    /// Regression guard for the introduction of the `content_kind` field:
    /// before that field existed this test could not compile.
    #[test]
    fn chunk_content_kind_code_for_rust_file() {
        let chunks = chunk_with_defaults("src/lib.rs", "rs", "fn hello() {}\n");
        assert!(!chunks.is_empty(), "expected at least one chunk");
        assert!(
            chunks.iter().all(|c| c.content_kind == ContentKind::Code),
            "all chunks from a .rs file must have ContentKind::Code; got: {:?}",
            content_kinds(&chunks)
        );
    }

    /// Every chunk produced from a `.json` file is tagged `ContentKind::Meta`.
    #[test]
    fn chunk_content_kind_meta_for_json_file() {
        let chunks = chunk_with_defaults("data.json", "json", r#"{"key": "value", "answer": 42}"#);
        assert!(!chunks.is_empty(), "expected at least one chunk");
        assert!(
            chunks.iter().all(|c| c.content_kind == ContentKind::Meta),
            "all chunks from a .json file must have ContentKind::Meta; got: {:?}",
            content_kinds(&chunks)
        );
    }

    /// Every chunk produced from a `.md` file is tagged `ContentKind::Docs`.
    #[test]
    fn chunk_content_kind_docs_for_md_file() {
        let chunks = chunk_with_defaults("README.md", "md", "# Title\n\nSome prose content.\n");
        assert!(!chunks.is_empty(), "expected at least one chunk");
        assert!(
            chunks.iter().all(|c| c.content_kind == ContentKind::Docs),
            "all chunks from a .md file must have ContentKind::Docs; got: {:?}",
            content_kinds(&chunks)
        );
    }

    /// Chunks from `.yaml`, `.toml`, and `.xml` files are all tagged
    /// `ContentKind::Meta`.
    #[test]
    fn chunk_content_kind_meta_for_yaml_toml_xml() {
        // YAML
        let yaml_chunks = chunk_with_defaults("config.yaml", "yaml", "key: value\nanother: 42\n");
        assert!(!yaml_chunks.is_empty(), "expected yaml chunks");
        assert!(
            yaml_chunks
                .iter()
                .all(|c| c.content_kind == ContentKind::Meta),
            "yaml chunks must be Meta; got: {:?}",
            content_kinds(&yaml_chunks)
        );

        // TOML
        let toml_chunks = chunk_with_defaults("Cargo.toml", "toml", "[section]\nkey = \"value\"\n");
        assert!(!toml_chunks.is_empty(), "expected toml chunks");
        assert!(
            toml_chunks
                .iter()
                .all(|c| c.content_kind == ContentKind::Meta),
            "toml chunks must be Meta; got: {:?}",
            content_kinds(&toml_chunks)
        );

        // XML
        let xml_chunks = chunk_with_defaults(
            "data.xml",
            "xml",
            r#"<?xml version="1.0"?><root><item>hello</item></root>"#,
        );
        assert!(!xml_chunks.is_empty(), "expected xml chunks");
        assert!(
            xml_chunks
                .iter()
                .all(|c| c.content_kind == ContentKind::Meta),
            "xml chunks must be Meta; got: {:?}",
            content_kinds(&xml_chunks)
        );
    }

    /// `ContentKind::from_extension` maps representative extensions from each
    /// category to the expected variant.
    #[test]
    fn content_kind_from_extension_covers_code_docs_meta() {
        let code_exts = [
            "rs", "py", "ts", "go", "java", "cpp", "sh", "rb", "kt", "swift", "scala",
        ];
        let docs_exts = ["md", "rst", "txt", "adoc", "org"];
        let meta_exts = [
            "json", "yaml", "yml", "toml", "xml", "lock", "snap", "csv", "tsv", "proto",
        ];

        for ext in code_exts {
            assert_eq!(
                ContentKind::from_extension(ext),
                ContentKind::Code,
                ".{ext} should be Code"
            );
        }
        for ext in docs_exts {
            assert_eq!(
                ContentKind::from_extension(ext),
                ContentKind::Docs,
                ".{ext} should be Docs"
            );
        }
        for ext in meta_exts {
            assert_eq!(
                ContentKind::from_extension(ext),
                ContentKind::Meta,
                ".{ext} should be Meta"
            );
        }
    }

    /// Chunks produced for a plain `.txt` file through `chunk_source_for_path`
    /// also carry the `content_kind` derived from the extension.
    #[test]
    fn sliding_window_chunks_carry_content_kind() {
        let chunks = chunk_source_for_path(
            Path::new("notes.txt"),
            "just some plain text with no tree-sitter grammar support\n",
            false,
            &ChunkConfig::default(),
        );
        assert!(!chunks.is_empty(), "expected at least one chunk");
        assert!(
            chunks.iter().all(|c| c.content_kind == ContentKind::Docs),
            "notes.txt chunks must be Docs; got: {:?}",
            content_kinds(&chunks)
        );
    }
}