Skip to main content

ripvec_core/
chunk.rs

1//! Tree-sitter based code chunking with sliding-window fallback.
2//!
3//! Parses source files into ASTs and extracts semantic chunks at
4//! function, class, and method boundaries. For files without recognized
5//! semantic structure (or very large fallback chunks), splits into
6//! overlapping sliding windows for uniform embedding sizes.
7
8use std::path::Path;
9use streaming_iterator::StreamingIterator;
10use tree_sitter::{Parser, QueryCursor};
11
12/// High-level content category of a chunked file.
13///
14/// Assigned at chunk-emission time based on the source file's extension.
15/// Used by downstream tools (`get_repo_map`, `find_duplicates`, `search`)
16/// to filter corpus slices without re-examining every chunk's content.
17///
18/// Serialized via `rkyv` and `bitcode` alongside [`CodeChunk`] so the
19/// value survives the in-memory index round-trip without re-derivation.
20///
21/// The default is [`ContentKind::Code`] so that test fixtures and
22/// pre-existing construction sites that don't yet specify a kind continue
23/// to behave as before (code-centric queries include them by default).
24#[derive(
25    Debug,
26    Clone,
27    Copy,
28    Default,
29    PartialEq,
30    Eq,
31    Hash,
32    rkyv::Archive,
33    rkyv::Serialize,
34    rkyv::Deserialize,
35    bitcode::Encode,
36    bitcode::Decode,
37)]
38pub enum ContentKind {
39    /// Source code files (`.rs`, `.py`, `.ts`, `.go`, etc.).
40    ///
41    /// This is the default variant so that `..Default::default()` in struct
42    /// update expressions produces `ContentKind::Code`.
43    #[default]
44    Code,
45    /// Human-readable documentation (`.md`, `.rst`, `.txt`, `.adoc`, `.org`).
46    Docs,
47    /// Machine-generated or structured data files (`.json`, `.yaml`, `.toml`,
48    /// `.xml`, `.lock`, `.snap`, `.csv`, `.tsv`, `.proto`).
49    Meta,
50}
51
52impl ContentKind {
53    /// Classify a file extension (without leading dot, case-insensitive) into
54    /// a [`ContentKind`].
55    ///
56    /// Unknown extensions default to [`ContentKind::Code`] so that novel
57    /// source files are included in code search paths rather than silently
58    /// filtered.
59    #[must_use]
60    pub fn from_extension(ext: &str) -> Self {
61        match ext.to_ascii_lowercase().as_str() {
62            // -- Docs --
63            "md" | "rst" | "txt" | "adoc" | "asciidoc" | "org" => Self::Docs,
64            // -- Meta (structured data / generated files) --
65            // HCL: .tfvars carries environment variable values (per-env
66            // settings), not application code. Classifying as Meta makes
67            // find_duplicates(include_metadata=false) drop the noisy
68            // F1 Environment Fan-Out clusters by default (R4, Wave 3).
69            "json" | "yaml" | "yml" | "toml" | "xml" | "lock" | "snap" | "csv" | "tsv"
70            | "proto" | "rdf" | "owl" | "tfvars" => Self::Meta,
71            // -- Code (all recognised source extensions, plus unknown = code) --
72            _ => Self::Code,
73        }
74    }
75}
76
/// Tunable sizes for the chunking pipeline.
///
/// Every field is a byte count. Hand a reference to [`chunk_file`] (or the
/// other chunking entry points) to change chunk sizing at runtime instead
/// of recompiling.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Largest chunk, in bytes, that is emitted whole.
    ///
    /// Anything bigger is cut into windows, even when tree-sitter reported
    /// it as one definition (e.g., a 500-line function).
    pub max_chunk_bytes: usize,
    /// Target size in bytes of one window produced by the sliding-window
    /// fallback.
    ///
    /// ~2KB of source text ≈ 128-256 tokens after BPE, comfortably inside
    /// the 512-token model limit and a good size for embedding quality.
    pub window_size: usize,
    /// Number of bytes shared by two neighbouring windows.
    ///
    /// The overlap lets a definition that straddles a window boundary show
    /// up intact in at least one window. The default is 25% of
    /// `window_size`.
    pub window_overlap: usize,
}

impl Default for ChunkConfig {
    /// 4 KiB maximum chunk, 2 KiB windows, 512-byte overlap.
    fn default() -> Self {
        const KIB: usize = 1024;
        Self {
            max_chunk_bytes: 4 * KIB,
            window_size: 2 * KIB,
            window_overlap: KIB / 2,
        }
    }
}
106
107/// A semantic chunk extracted from a source file.
108#[derive(
109    Debug,
110    Clone,
111    rkyv::Archive,
112    rkyv::Serialize,
113    rkyv::Deserialize,
114    bitcode::Encode,
115    bitcode::Decode,
116)]
117pub struct CodeChunk {
118    /// Path to the source file.
119    pub file_path: String,
120    /// Name of the definition (function name, class name, etc.).
121    pub name: String,
122    /// Kind of syntax node (e.g., `function_item`, `class_definition`).
123    /// For chunks emitted from the `dense.rs` semble path, this is the
124    /// LSP SymbolKind numeric value serialized as a string (e.g., `"12"` for
125    /// Function). For chunks emitted from `chunk.rs`'s query-based path, this
126    /// is the raw tree-sitter node kind string. Downstream consumers should
127    /// call [`crate::languages::lsp_symbol_kind_for_node_kind`] to normalize.
128    pub kind: String,
129    /// High-level content category derived from the file extension.
130    ///
131    /// Set by the chunker at emission time. Downstream filters (e.g.,
132    /// `find_duplicates`, `get_repo_map`) use this to exclude
133    /// [`ContentKind::Meta`] files from code-centric queries by default.
134    pub content_kind: ContentKind,
135    /// 1-based start line number (first line of the chunk, which may be a
136    /// doc-comment line preceding the symbol's identifier).
137    pub start_line: usize,
138    /// 1-based end line number.
139    pub end_line: usize,
140    /// 1-based line number of the symbol's name-identifier token within this
141    /// chunk. For tree-sitter query-based chunks (`chunk_file`), this is the
142    /// exact line where the `@name` capture appears, so LSP tools that jump to
143    /// `lsp_location.start_line` land on the identifier rather than on a
144    /// preceding doc-comment line.
145    ///
146    /// Defaults to `start_line` for chunks produced by the sliding-window
147    /// fallback, the semble/dense path, and RDF-block paths, where the
148    /// name-identifier line is not separately tracked.
149    ///
150    /// Convention matches `start_line`: 1-based, relative to the file's first
151    /// line.
152    pub symbol_line: usize,
153    /// Source text of the chunk (raw code for display).
154    pub content: String,
155    /// Enriched content with scope chain and signature metadata for embedding.
156    /// Falls back to `content` if metadata would exceed chunk size limits.
157    pub enriched_content: String,
158    /// Optional language-specific composite identifier that disambiguates a
159    /// symbol whose bare `name` is shared across many declarations.
160    ///
161    /// For HCL: `resource "aws_iam_role" "loader" { ... }` ships
162    /// `name = "loader"` and `qualified_name = Some("aws_iam_role.loader")`.
163    /// Each `local.X = ...` extracted as its own chunk ships
164    /// `qualified_name = Some("local.X")`. Workspace symbol consumers
165    /// surface `qualified_name` when set so a query for the composite form
166    /// finds the right resource on corpora that reuse names like `loader`
167    /// across many resource types (R1, R6, Wave 3).
168    ///
169    /// `None` for all non-HCL chunks today; reserved for future
170    /// language-specific composite naming.
171    pub qualified_name: Option<String>,
172}
173
174/// Walk up the AST parent chain collecting structural container names.
175///
176/// Produces a scope chain like `"impl_item Foo > fn forward"` by
177/// identifying structural containers (impl blocks, classes, modules, namespaces)
178/// and extracting their names. Tries the `name` field first, then `type`
179/// (for Rust `impl_item` which uses `type` instead of `name`).
180#[must_use]
181pub fn build_scope_chain(node: tree_sitter::Node<'_>, source: &str) -> String {
182    /// Node kinds that represent structural containers, by language.
183    const CONTAINER_KINDS: &[&str] = &[
184        // Rust
185        "impl_item",
186        "trait_item",
187        "mod_item",
188        // Python
189        "class_definition",
190        "module",
191        // JS/TS
192        "class_declaration",
193        // Java
194        // "class_declaration" already covered above
195        // Go
196        "type_declaration",
197        // C++
198        "namespace_definition",
199        "class_specifier",
200    ];
201
202    /// Field names to try when extracting the container name.
203    /// `impl_item` uses `type` instead of `name`; Go `type_declaration`
204    /// has no fields, so we fall back to the node kind.
205    const NAME_FIELDS: &[&str] = &["name", "type"];
206
207    let mut parts = Vec::new();
208    let mut current = node.parent();
209    while let Some(parent) = current {
210        let kind = parent.kind();
211        if CONTAINER_KINDS.contains(&kind) {
212            let name = NAME_FIELDS
213                .iter()
214                .find_map(|field| parent.child_by_field_name(field))
215                .map_or(kind, |n| &source[n.start_byte()..n.end_byte()]);
216            parts.push(format!("{kind} {name}"));
217        }
218        current = parent.parent();
219    }
220    parts.reverse();
221    parts.join(" > ")
222}
223
224/// Extract the function/method signature from a definition node.
225///
226/// Returns the text from the function name to the start of the body,
227/// which captures the parameter list and return type (if any).
228/// Returns `None` if the node has no `name` or `body`/`block` field.
229#[must_use]
230pub fn extract_signature(node: tree_sitter::Node<'_>, source: &str) -> Option<String> {
231    let name_node = node.child_by_field_name("name")?;
232    let body_node = node
233        .child_by_field_name("body")
234        .or_else(|| node.child_by_field_name("block"))?;
235    let start = name_node.start_byte();
236    let end = body_node.start_byte();
237    if start >= end {
238        return None;
239    }
240    let sig = source[start..end].trim();
241    if sig.is_empty() {
242        None
243    } else {
244        Some(sig.to_string())
245    }
246}
247
/// Shrink whitespace so less of the embedding window is spent on it.
///
/// Line by line:
/// - Leading spaces and tabs are measured in columns (a space is one
///   column, a tab is two) and replaced by half as many spaces, rounded up:
///   4 spaces → 2, 8 spaces → 4, 3 spaces → 2, 1 tab → 1.
/// - Trailing whitespace is removed.
/// - Line endings come out as `\n`.
///
/// A run of consecutive blank (whitespace-only) lines is reduced to a
/// single empty line. The result ends with a newline only if `source` did.
#[must_use]
pub fn minify_whitespace(source: &str) -> String {
    let mut out = String::with_capacity(source.len());
    let mut in_blank_run = false;

    for raw in source.lines() {
        let text = raw.trim();

        if text.is_empty() {
            // Keep only the first line of a blank run.
            if !in_blank_run {
                out.push('\n');
                in_blank_run = true;
            }
            continue;
        }
        in_blank_run = false;

        // Width of the leading space/tab prefix, in columns.
        let columns: usize = raw
            .chars()
            .map_while(|c| match c {
                ' ' => Some(1),
                '\t' => Some(2),
                _ => None,
            })
            .sum();

        // Two input columns become one output space, rounding up so a
        // single-column indent does not vanish.
        out.extend(std::iter::repeat(' ').take(columns.div_ceil(2)));
        out.push_str(text);
        out.push('\n');
    }

    // Every emitted line got a '\n'; undo the last one when the input had
    // no final newline.
    if out.ends_with('\n') && !source.ends_with('\n') {
        out.pop();
    }

    out
}
298
299/// Build the enriched content header for a code chunk.
300///
301/// Prepends scope chain and signature metadata as a comment line.
302/// If the header + content would exceed `max_bytes`, returns `content` unchanged.
303fn build_enriched_content(
304    path: &Path,
305    node: tree_sitter::Node<'_>,
306    source: &str,
307    content: &str,
308    max_bytes: usize,
309) -> String {
310    let scope = build_scope_chain(node, source);
311    let sig = extract_signature(node, source).unwrap_or_default();
312    let rel_path = path.display().to_string();
313
314    let header = if scope.is_empty() && sig.is_empty() {
315        format!("// {rel_path}\n")
316    } else if scope.is_empty() {
317        format!("// {rel_path} | defines: {sig}\n")
318    } else if sig.is_empty() {
319        format!("// {rel_path} | {scope}\n")
320    } else {
321        format!("// {rel_path} | {scope} | defines: {sig}\n")
322    };
323
324    // Minify whitespace for the embedding content to reduce token waste.
325    // The raw `content` field is kept as-is for display.
326    let minified = minify_whitespace(content);
327
328    if header.len() + minified.len() > max_bytes {
329        minified
330    } else {
331        format!("{header}{minified}")
332    }
333}
334
335/// Extract semantic chunks from a source file.
336///
337/// Uses tree-sitter to parse the file and extract definitions matching
338/// the language's query patterns. For files with no semantic matches,
339/// falls back to overlapping sliding windows. Large individual chunks
340/// are also split into windows.
341///
342/// Pass a [`ChunkConfig`] to control chunk sizing at runtime.
343#[must_use]
344pub fn chunk_file(
345    path: &Path,
346    source: &str,
347    config: &crate::languages::LangConfig,
348    chunk_config: &ChunkConfig,
349) -> Vec<CodeChunk> {
350    let mut parser = Parser::new();
351    if parser.set_language(&config.language).is_err() {
352        return sliding_windows(path, source, chunk_config);
353    }
354
355    let Some(tree) = parser.parse(source, None) else {
356        return sliding_windows(path, source, chunk_config);
357    };
358
359    let mut cursor = QueryCursor::new();
360    let mut chunks = Vec::new();
361    let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
362
363    while let Some(m) = matches.next() {
364        let mut name = String::new();
365        // 1-based line of the `@name` capture node (identifier line).
366        // Set when the `@name` capture is found; falls back to def-node start_line.
367        let mut name_line: Option<usize> = None;
368        let mut def_node = None;
369        for cap in m.captures {
370            let cap_name = &config.query.capture_names()[cap.index as usize];
371            if *cap_name == "name" {
372                name = source[cap.node.start_byte()..cap.node.end_byte()].to_string();
373                // tree-sitter row is 0-based; convert to 1-based to match start_line.
374                name_line = Some(cap.node.start_position().row + 1);
375            } else if *cap_name == "def" {
376                def_node = Some(cap.node);
377            }
378        }
379        if let Some(node) = def_node {
380            let content = &source[node.start_byte()..node.end_byte()];
381            let start_line = node.start_position().row + 1;
382            // `symbol_line` is the identifier's line; falls back to `start_line`
383            // when the `@name` capture was not found (should not happen for
384            // well-formed queries, but defended here for robustness).
385            let symbol_line = name_line.unwrap_or(start_line);
386
387            // HCL: extract `type.name` composite for `block` nodes via the
388            // shared derive_hcl_block_name helper, and expand `locals { ... }`
389            // blocks into one chunk per attribute (R1 + R6, Wave 3).
390            let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
391            let is_hcl = matches!(ext.to_ascii_lowercase().as_str(), "tf" | "tfvars" | "hcl");
392            let qualified_name = if is_hcl && node.kind() == "block" {
393                let composite = crate::languages::derive_hcl_block_name(&node, source.as_bytes());
394                if composite.is_empty() || composite == name {
395                    None
396                } else {
397                    Some(composite)
398                }
399            } else {
400                None
401            };
402
403            // R6: expand `locals { ... }` into per-attribute chunks. This
404            // produces additional chunks (one per local) but also keeps the
405            // umbrella `locals` chunk so consumers that want the block as
406            // a whole still see it.
407            if is_hcl && node.kind() == "block" && name == "locals" {
408                chunks.extend(emit_hcl_local_attribute_chunks(
409                    path,
410                    source,
411                    node,
412                    chunk_config,
413                ));
414            }
415
416            // Split oversized chunks into windows
417            if content.len() > chunk_config.max_chunk_bytes {
418                chunks.extend(sliding_windows_with_name(
419                    path,
420                    content,
421                    &name,
422                    start_line,
423                    chunk_config,
424                ));
425            } else {
426                let enriched = build_enriched_content(
427                    path,
428                    node,
429                    source,
430                    content,
431                    chunk_config.max_chunk_bytes,
432                );
433                chunks.push(CodeChunk {
434                    file_path: path.display().to_string(),
435                    name,
436                    kind: node.kind().to_string(),
437                    content_kind: ContentKind::from_extension(ext),
438                    start_line,
439                    end_line: node.end_position().row + 1,
440                    symbol_line,
441                    enriched_content: enriched,
442                    content: content.to_string(),
443                    qualified_name,
444                });
445            }
446        }
447    }
448
449    // Fallback: sliding windows if no semantic matches
450    if chunks.is_empty() && !source.trim().is_empty() {
451        return sliding_windows(path, source, chunk_config);
452    }
453
454    chunks
455}
456
457/// Walk a HCL `locals { ... }` block and emit one [`CodeChunk`] per
458/// `attribute` (`local.X = value`).
459///
460/// Each emitted chunk has `kind = "local_attribute"` (which maps to
461/// LSP `SymbolKind::Constant` (14) via
462/// [`crate::languages::lsp_symbol_kind_for_node_kind`]), `name = X`,
463/// `qualified_name = Some("local.X")`, and `content_kind` derived from
464/// the file extension. T18 Cohesion Refraction queries can now distinguish
465/// individual locals (R6, Wave 3).
466///
467/// Only called from [`chunk_file`] when the chunker sees a `block` node
468/// whose `@name` capture is `"locals"`. Non-attribute body children
469/// (whitespace, comments) are skipped.
470fn emit_hcl_local_attribute_chunks(
471    path: &Path,
472    source: &str,
473    locals_block: tree_sitter::Node<'_>,
474    chunk_config: &ChunkConfig,
475) -> Vec<CodeChunk> {
476    let mut out = Vec::new();
477    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
478    let content_kind = ContentKind::from_extension(ext);
479
480    // Walk children: identifier, block_start, body, block_end. The body
481    // contains the `attribute` children we care about.
482    let mut cursor = locals_block.walk();
483    for child in locals_block.children(&mut cursor) {
484        if child.kind() != "body" {
485            continue;
486        }
487        let mut body_cursor = child.walk();
488        for attr in child.children(&mut body_cursor) {
489            if attr.kind() != "attribute" {
490                continue;
491            }
492            // Find the identifier (local name) and stop at the first.
493            let mut name_node: Option<tree_sitter::Node<'_>> = None;
494            let mut attr_cursor = attr.walk();
495            for grandchild in attr.children(&mut attr_cursor) {
496                if grandchild.kind() == "identifier" {
497                    name_node = Some(grandchild);
498                    break;
499                }
500            }
501            let Some(id_node) = name_node else {
502                continue;
503            };
504            let name_text = source[id_node.start_byte()..id_node.end_byte()].to_string();
505            let content_text = source[attr.start_byte()..attr.end_byte()].to_string();
506            let start_line = attr.start_position().row + 1;
507            let end_line = attr.end_position().row + 1;
508            let symbol_line = id_node.start_position().row + 1;
509            let composite = format!("local.{name_text}");
510            let header = format!("// {} | local: {composite}\n", path.display());
511            let enriched = if header.len() + content_text.len() <= chunk_config.max_chunk_bytes {
512                format!("{header}{content_text}")
513            } else {
514                content_text.clone()
515            };
516            out.push(CodeChunk {
517                file_path: path.display().to_string(),
518                name: name_text,
519                kind: "local_attribute".to_string(),
520                content_kind,
521                start_line,
522                end_line,
523                symbol_line,
524                enriched_content: enriched,
525                content: content_text,
526                qualified_name: Some(composite),
527            });
528        }
529    }
530    out
531}
532
533/// Split source text into overlapping sliding windows.
534///
535/// Each window is `chunk_config.window_size` bytes with `chunk_config.window_overlap` bytes of
536/// overlap. Window boundaries are adjusted to line breaks to avoid
537/// splitting mid-line.
538///
539/// This is used as the fallback for files without tree-sitter support
540/// (plain text, unknown extensions) and for large semantic chunks that
541/// exceed `max_chunk_bytes`.
542#[must_use]
543pub fn chunk_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
544    sliding_windows(path, source, chunk_config)
545}
546
/// Report whether `ext` names an RDF-family text format that has no stable
/// Rust tree-sitter grammar.
///
/// `ext` is given without the leading dot; the comparison ignores ASCII
/// case.
#[must_use]
pub fn is_rdf_text_extension(ext: &str) -> bool {
    const RDF_TEXT_EXTENSIONS: [&str; 5] = ["ttl", "nt", "n3", "trig", "nq"];
    RDF_TEXT_EXTENSIONS
        .iter()
        .any(|known| ext.eq_ignore_ascii_case(known))
}
555
556/// Chunk Turtle/N-Triples/TriG/N-Quads style RDF by statement blocks.
557///
558/// RDF text formats are denser than prose but often lack a mature packaged
559/// tree-sitter grammar. This keeps prefixes together and groups multi-line
560/// subject statements ending in `.` so ontology classes and predicates remain
561/// intact for embedding.
562#[must_use]
563pub fn chunk_rdf_text(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
564    if source.trim().is_empty() {
565        return vec![];
566    }
567
568    let mut chunks = Vec::new();
569    let mut current = String::new();
570    let mut current_start_line = 1usize;
571    let mut current_is_directive = false;
572
573    for (line_idx, line) in source.lines().enumerate() {
574        let line_no = line_idx + 1;
575        let trimmed = line.trim();
576        if trimmed.is_empty() {
577            flush_rdf_block(
578                path,
579                &current,
580                current_start_line,
581                chunk_config,
582                &mut chunks,
583            );
584            current.clear();
585            current_is_directive = false;
586            continue;
587        }
588
589        let line_is_directive = is_rdf_directive(trimmed);
590        if !current.is_empty() && current_is_directive && !line_is_directive {
591            flush_rdf_block(
592                path,
593                &current,
594                current_start_line,
595                chunk_config,
596                &mut chunks,
597            );
598            current.clear();
599            current_is_directive = false;
600        }
601
602        if current.is_empty() {
603            current_start_line = line_no;
604            current_is_directive = line_is_directive;
605        }
606        current.push_str(line);
607        current.push('\n');
608
609        if !current_is_directive && trimmed.ends_with('.') {
610            flush_rdf_block(
611                path,
612                &current,
613                current_start_line,
614                chunk_config,
615                &mut chunks,
616            );
617            current.clear();
618            current_is_directive = false;
619        }
620    }
621
622    flush_rdf_block(
623        path,
624        &current,
625        current_start_line,
626        chunk_config,
627        &mut chunks,
628    );
629    if chunks.is_empty() {
630        sliding_windows(path, source, chunk_config)
631    } else {
632        chunks
633    }
634}
635
636/// Chunk a source file according to its path extension.
637#[must_use]
638pub fn chunk_source_for_path(
639    path: &Path,
640    source: &str,
641    text_mode: bool,
642    chunk_config: &ChunkConfig,
643) -> Vec<CodeChunk> {
644    if text_mode {
645        return chunk_text(path, source, chunk_config);
646    }
647
648    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
649    if let Some(lang_config) = crate::languages::config_for_extension(ext) {
650        chunk_file(path, source, &lang_config, chunk_config)
651    } else if is_rdf_text_extension(ext) {
652        chunk_rdf_text(path, source, chunk_config)
653    } else {
654        chunk_text(path, source, chunk_config)
655    }
656}
657
/// Report whether an already-trimmed line opens with an RDF directive
/// keyword: `@prefix`, `@base`, `PREFIX` or `BASE`.
///
/// This is a case-sensitive prefix test; the text after the keyword is not
/// inspected.
fn is_rdf_directive(trimmed: &str) -> bool {
    const DIRECTIVE_KEYWORDS: [&str; 4] = ["@prefix", "@base", "PREFIX", "BASE"];
    DIRECTIVE_KEYWORDS
        .iter()
        .any(|&keyword| trimmed.starts_with(keyword))
}
664
665fn flush_rdf_block(
666    path: &Path,
667    content: &str,
668    start_line: usize,
669    chunk_config: &ChunkConfig,
670    chunks: &mut Vec<CodeChunk>,
671) {
672    let trimmed = content.trim();
673    if trimmed.is_empty() {
674        return;
675    }
676    let name = rdf_block_name(trimmed, path);
677    let content = format!("{trimmed}\n");
678    if content.len() > chunk_config.max_chunk_bytes {
679        chunks.extend(sliding_window_chunks(
680            &content,
681            path,
682            &name,
683            start_line,
684            chunk_config,
685        ));
686        return;
687    }
688    let header = format!("# {} | rdf: {name}\n", path.display());
689    let enriched_content = if header.len() + content.len() <= chunk_config.max_chunk_bytes {
690        format!("{header}{content}")
691    } else {
692        content.clone()
693    };
694    let line_count = content.lines().count().max(1);
695    let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
696    chunks.push(CodeChunk {
697        file_path: path.display().to_string(),
698        name,
699        kind: "rdf_statements".to_string(),
700        content_kind: ContentKind::from_extension(ext),
701        start_line,
702        end_line: start_line + line_count - 1,
703        // RDF chunks have no name-identifier line distinct from the chunk start.
704        symbol_line: start_line,
705        enriched_content,
706        content,
707        qualified_name: None,
708    });
709}
710
/// Derive a display name for an RDF block.
///
/// The first line that is neither blank nor a `#` comment decides:
/// - a prefix directive (`@prefix` / `PREFIX`) names the block `"@prefix"`;
/// - a base directive (`@base` / `BASE`) names it `"@base"`;
/// - otherwise the name is the line's first whitespace-separated token
///   with trailing `;`, `,` and `.` removed.
///
/// When there is no such line, or the token is empty after stripping, the
/// file name of `path` is used (an empty string if the path has none).
fn rdf_block_name(content: &str, path: &Path) -> String {
    let file_name = || {
        path.file_name()
            .unwrap_or_default()
            .to_string_lossy()
            .into_owned()
    };

    let first_meaningful = content
        .lines()
        .map(str::trim)
        .find(|line| !(line.is_empty() || line.starts_with('#')));
    let first = match first_meaningful {
        Some(line) => line,
        None => return file_name(),
    };

    if ["@prefix", "PREFIX"].iter().any(|&kw| first.starts_with(kw)) {
        return "@prefix".to_string();
    }
    if ["@base", "BASE"].iter().any(|&kw| first.starts_with(kw)) {
        return "@base".to_string();
    }

    // The leading token is normally the statement's subject.
    let subject = first
        .split_whitespace()
        .next()
        .unwrap_or("")
        .trim_end_matches([';', ',', '.']);
    if subject.is_empty() {
        file_name()
    } else {
        subject.to_string()
    }
}
745
746/// Internal sliding-window implementation.
747fn sliding_windows(path: &Path, source: &str, chunk_config: &ChunkConfig) -> Vec<CodeChunk> {
748    if source.trim().is_empty() {
749        return vec![];
750    }
751
752    // Small enough for a single chunk
753    if source.len() <= chunk_config.max_chunk_bytes {
754        let content = source.to_string();
755        let ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
756        return vec![CodeChunk {
757            file_path: path.display().to_string(),
758            name: path
759                .file_name()
760                .unwrap_or_default()
761                .to_string_lossy()
762                .to_string(),
763            kind: "file".to_string(),
764            content_kind: ContentKind::from_extension(ext),
765            start_line: 1,
766            end_line: source.lines().count(),
767            // Whole-file chunks have no distinct identifier line.
768            symbol_line: 1,
769            enriched_content: content.clone(),
770            content,
771            qualified_name: None,
772        }];
773    }
774
775    let file_name = path
776        .file_name()
777        .unwrap_or_default()
778        .to_string_lossy()
779        .to_string();
780    sliding_window_chunks(source, path, &file_name, 1, chunk_config)
781}
782
783/// Split a named definition into overlapping windows.
784///
785/// Used when a single tree-sitter match (e.g., a large function) exceeds
786/// `chunk_config.max_chunk_bytes`. Windows carry the definition name for search context.
787fn sliding_windows_with_name(
788    path: &Path,
789    content: &str,
790    name: &str,
791    base_line: usize,
792    chunk_config: &ChunkConfig,
793) -> Vec<CodeChunk> {
794    sliding_window_chunks(content, path, name, base_line, chunk_config)
795}
796
797/// Shared sliding-window loop used by both [`sliding_windows`] and
798/// [`sliding_windows_with_name`].
799///
800/// Splits `source` into overlapping windows of `chunk_config.window_size` bytes,
801/// snapping boundaries to line breaks. Each chunk is tagged with `name_prefix`
802/// and an index suffix (e.g., `"main[0]"`, `"main[1]"`).
803fn sliding_window_chunks(
804    source: &str,
805    file_path: &Path,
806    name_prefix: &str,
807    base_line: usize,
808    chunk_config: &ChunkConfig,
809) -> Vec<CodeChunk> {
810    let step = chunk_config
811        .window_size
812        .saturating_sub(chunk_config.window_overlap)
813        .max(1);
814    let bytes = source.as_bytes();
815    let mut chunks = Vec::new();
816    let mut offset = 0;
817    let mut window_idx = 0;
818
819    while offset < bytes.len() {
820        let raw_end = (offset + chunk_config.window_size).min(bytes.len());
821
822        // Snap end to a line boundary (don't split mid-line)
823        let end = if raw_end < bytes.len() {
824            match bytes[offset..raw_end].iter().rposition(|&b| b == b'\n') {
825                Some(pos) => offset + pos + 1,
826                None => raw_end, // no newline found, use raw end
827            }
828        } else {
829            raw_end
830        };
831
832        // Extract window as str (skip invalid UTF-8)
833        if let Ok(window) = std::str::from_utf8(&bytes[offset..end])
834            && !window.trim().is_empty()
835        {
836            let start_line = base_line + source[..offset].matches('\n').count();
837            let content_lines = window.lines().count().max(1);
838            let end_line = start_line + content_lines - 1;
839            let content = window.to_string();
840            let ext = file_path.extension().and_then(|e| e.to_str()).unwrap_or("");
841            chunks.push(CodeChunk {
842                file_path: file_path.display().to_string(),
843                name: format!("{name_prefix}[{window_idx}]"),
844                kind: "window".to_string(),
845                content_kind: ContentKind::from_extension(ext),
846                start_line,
847                end_line,
848                // Sliding-window chunks have no distinct identifier line.
849                symbol_line: start_line,
850                enriched_content: content.clone(),
851                content,
852                qualified_name: None,
853            });
854            window_idx += 1;
855        }
856
857        offset += step;
858    }
859
860    chunks
861}
862
863#[cfg(test)]
864mod tests {
865    use super::*;
866    use std::fmt::Write as _;
867    use std::path::Path;
868
869    #[test]
870    fn chunks_rust_functions_and_structs() {
871        let source = "fn hello() { println!(\"hi\"); }\nfn world() {}\nstruct Foo { x: i32 }";
872        let config = crate::languages::config_for_extension("rs").unwrap();
873        let chunks = chunk_file(
874            Path::new("test.rs"),
875            source,
876            &config,
877            &ChunkConfig::default(),
878        );
879        assert!(
880            chunks.len() >= 2,
881            "expected at least 2 chunks, got {}",
882            chunks.len()
883        );
884        assert!(chunks.iter().any(|c| c.name == "hello"));
885        assert!(chunks.iter().any(|c| c.name == "world"));
886    }
887
888    #[test]
889    fn chunks_python_functions_and_classes() {
890        let source = "def greet(name):\n    pass\n\nclass Foo:\n    pass\n";
891        let config = crate::languages::config_for_extension("py").unwrap();
892        let chunks = chunk_file(
893            Path::new("test.py"),
894            source,
895            &config,
896            &ChunkConfig::default(),
897        );
898        assert!(chunks.len() >= 2);
899        assert!(chunks.iter().any(|c| c.name == "greet"));
900        assert!(chunks.iter().any(|c| c.name == "Foo"));
901    }
902
903    #[test]
904    fn chunks_python_stub_functions_and_classes() {
905        let source = "from typing import Protocol\n\ndef greet(name: str) -> str: ...\n\nclass Foo(Protocol):\n    value: int\n";
906        let config = crate::languages::config_for_extension("pyi").unwrap();
907        let chunks = chunk_file(
908            Path::new("test.pyi"),
909            source,
910            &config,
911            &ChunkConfig::default(),
912        );
913        assert!(chunks.len() >= 2);
914        assert!(chunks.iter().any(|c| c.name == "greet"));
915        assert!(chunks.iter().any(|c| c.name == "Foo"));
916    }
917
918    #[test]
919    fn fallback_small_file_single_chunk() {
920        // With enriched queries, `let x = 42` matches variable_declarator.
921        // Use a source with NO tree-sitter captures to test the plaintext fallback.
922        let source = "// just a comment\n// and another\n";
923        let config = crate::languages::config_for_extension("js").unwrap();
924        let chunks = chunk_file(
925            Path::new("script.js"),
926            source,
927            &config,
928            &ChunkConfig::default(),
929        );
930        assert_eq!(chunks.len(), 1);
931        assert_eq!(chunks[0].kind, "file");
932    }
933
934    #[test]
935    fn fallback_large_file_produces_windows() {
936        // Create a file larger than default max_chunk_bytes with no function declarations
937        let line = "console.log('hello world, this is a long line of javascript code');\n";
938        let source: String = line.repeat(200); // ~13KB
939        let chunk_config = ChunkConfig::default();
940        assert!(source.len() > chunk_config.max_chunk_bytes);
941
942        let config = crate::languages::config_for_extension("js").unwrap();
943        let chunks = chunk_file(Path::new("big.js"), &source, &config, &chunk_config);
944        assert!(
945            chunks.len() > 1,
946            "expected multiple windows, got {}",
947            chunks.len()
948        );
949        assert!(chunks.iter().all(|c| c.kind == "window"));
950        assert!(chunks[0].name.contains("[0]"));
951    }
952
953    #[test]
954    fn large_definition_is_windowed() {
955        // A Rust function larger than default max_chunk_bytes
956        let mut source = String::from("fn big_function() {\n");
957        for i in 0..200 {
958            writeln!(source, "    let var_{i} = {i} * 2 + 1; // some computation").unwrap();
959        }
960        source.push_str("}\n");
961        let chunk_config = ChunkConfig::default();
962        assert!(source.len() > chunk_config.max_chunk_bytes);
963
964        let config = crate::languages::config_for_extension("rs").unwrap();
965        let chunks = chunk_file(Path::new("test.rs"), &source, &config, &chunk_config);
966        assert!(
967            chunks.len() > 1,
968            "expected windowed chunks, got {}",
969            chunks.len()
970        );
971        assert!(chunks[0].name.starts_with("big_function["));
972    }
973
974    #[test]
975    fn empty_file_produces_no_chunks() {
976        let config = crate::languages::config_for_extension("rs").unwrap();
977        let chunks = chunk_file(Path::new("empty.rs"), "", &config, &ChunkConfig::default());
978        assert!(chunks.is_empty());
979    }
980
981    // --- T1 enrichment tests ---
982
983    /// Helper: parse source with tree-sitter and return the first `@def` node.
984    fn first_def_node(
985        source: &str,
986        ext: &str,
987    ) -> (
988        tree_sitter::Tree,
989        std::sync::Arc<crate::languages::LangConfig>,
990    ) {
991        let config = crate::languages::config_for_extension(ext).unwrap();
992        let mut parser = Parser::new();
993        parser.set_language(&config.language).unwrap();
994        let tree = parser.parse(source, None).unwrap();
995        (tree, config)
996    }
997
998    #[test]
999    fn scope_chain_rust_impl_method() {
1000        let source = "impl Foo {\n    fn bar(&self) {}\n}";
1001        let (tree, config) = first_def_node(source, "rs");
1002        let mut cursor = QueryCursor::new();
1003        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
1004
1005        let mut def_node = None;
1006        while let Some(m) = StreamingIterator::next(&mut matches) {
1007            for cap in m.captures {
1008                let cap_name = &config.query.capture_names()[cap.index as usize];
1009                if *cap_name == "def" {
1010                    def_node = Some(cap.node);
1011                }
1012            }
1013        }
1014        let node = def_node.expect("should find a @def node");
1015        let scope = build_scope_chain(node, source);
1016        assert!(
1017            scope.contains("impl_item"),
1018            "scope should contain impl_item, got: {scope}"
1019        );
1020        assert!(
1021            scope.contains("Foo"),
1022            "scope should contain 'Foo', got: {scope}"
1023        );
1024    }
1025
1026    #[test]
1027    fn scope_chain_python_class_method() {
1028        let source = "class Greeter:\n    def say_hello(self):\n        pass\n";
1029        let (tree, config) = first_def_node(source, "py");
1030        let mut cursor = QueryCursor::new();
1031        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
1032
1033        // Find the function_definition @def (say_hello), not the class @def
1034        let mut fn_node = None;
1035        while let Some(m) = StreamingIterator::next(&mut matches) {
1036            for cap in m.captures {
1037                let cap_name = &config.query.capture_names()[cap.index as usize];
1038                if *cap_name == "def" && cap.node.kind() == "function_definition" {
1039                    fn_node = Some(cap.node);
1040                }
1041            }
1042        }
1043        let node = fn_node.expect("should find say_hello @def node");
1044        let scope = build_scope_chain(node, source);
1045        assert!(
1046            scope.contains("class_definition"),
1047            "scope should contain class_definition, got: {scope}"
1048        );
1049        assert!(
1050            scope.contains("Greeter"),
1051            "scope should contain 'Greeter', got: {scope}"
1052        );
1053    }
1054
1055    #[test]
1056    fn extract_signature_rust_function() {
1057        let source = "fn greet(name: &str) -> String { name.to_string() }";
1058        let (tree, config) = first_def_node(source, "rs");
1059        let mut cursor = QueryCursor::new();
1060        let mut matches = cursor.matches(&config.query, tree.root_node(), source.as_bytes());
1061
1062        let mut def_node = None;
1063        while let Some(m) = StreamingIterator::next(&mut matches) {
1064            for cap in m.captures {
1065                let cap_name = &config.query.capture_names()[cap.index as usize];
1066                if *cap_name == "def" {
1067                    def_node = Some(cap.node);
1068                }
1069            }
1070        }
1071        let node = def_node.expect("should find @def node");
1072        let sig = extract_signature(node, source).expect("should extract signature");
1073        assert!(
1074            sig.contains("greet"),
1075            "signature should contain 'greet', got: {sig}"
1076        );
1077        assert!(
1078            sig.contains("name: &str"),
1079            "signature should contain parameter, got: {sig}"
1080        );
1081        assert!(
1082            sig.contains("-> String"),
1083            "signature should contain return type, got: {sig}"
1084        );
1085    }
1086
1087    #[test]
1088    fn enriched_content_has_header() {
1089        let source = "fn hello() { println!(\"hi\"); }";
1090        let config = crate::languages::config_for_extension("rs").unwrap();
1091        let chunks = chunk_file(
1092            Path::new("src/main.rs"),
1093            source,
1094            &config,
1095            &ChunkConfig::default(),
1096        );
1097        assert!(!chunks.is_empty());
1098        let chunk = &chunks[0];
1099        assert!(
1100            chunk.enriched_content.starts_with("//"),
1101            "enriched_content should start with '//' header, got: {}",
1102            &chunk.enriched_content[..chunk.enriched_content.len().min(80)]
1103        );
1104        assert!(
1105            chunk.enriched_content.contains("src/main.rs"),
1106            "enriched_content should contain file path"
1107        );
1108        // Raw content should NOT have the header
1109        assert!(
1110            !chunk.content.starts_with("//"),
1111            "raw content should not start with header"
1112        );
1113    }
1114
1115    #[test]
1116    fn sliding_window_enriched_equals_content() {
1117        let source = "let x = 42;\nconsole.log(x);\n";
1118        let chunks = chunk_text(Path::new("test.txt"), source, &ChunkConfig::default());
1119        assert!(!chunks.is_empty());
1120        for chunk in &chunks {
1121            assert_eq!(
1122                chunk.enriched_content, chunk.content,
1123                "sliding window chunks should have enriched_content == content"
1124            );
1125        }
1126    }
1127
1128    #[test]
1129    fn chunks_rdf_xml_and_owl_elements_with_tree_sitter() {
1130        let source = r#"<?xml version="1.0"?>
1131<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
1132         xmlns:owl="http://www.w3.org/2002/07/owl#">
1133  <owl:Class rdf:about="http://example.com/Person"/>
1134  <owl:ObjectProperty rdf:about="http://example.com/knows"/>
1135</rdf:RDF>"#;
1136        let rdf_config = crate::languages::config_for_extension("rdf").unwrap();
1137        let owl_config = crate::languages::config_for_extension("owl").unwrap();
1138
1139        let rdf_chunks = chunk_file(
1140            Path::new("ontology.rdf"),
1141            source,
1142            &rdf_config,
1143            &ChunkConfig::default(),
1144        );
1145        let owl_chunks = chunk_file(
1146            Path::new("ontology.owl"),
1147            source,
1148            &owl_config,
1149            &ChunkConfig::default(),
1150        );
1151
1152        assert!(rdf_chunks.iter().any(|chunk| chunk.name == "owl:Class"));
1153        assert!(
1154            rdf_chunks
1155                .iter()
1156                .any(|chunk| chunk.name == "owl:ObjectProperty")
1157        );
1158        assert!(rdf_chunks.iter().all(|chunk| chunk.kind == "element"));
1159        assert!(owl_chunks.iter().any(|chunk| chunk.name == "owl:Class"));
1160    }
1161
1162    #[test]
1163    fn chunks_turtle_by_rdf_statement_blocks() {
1164        let source = r#"@prefix ex: <http://example.com/> .
1165@prefix owl: <http://www.w3.org/2002/07/owl#> .
1166
1167ex:Person
1168  a owl:Class ;
1169  ex:label "Person" .
1170
1171ex:knows
1172  a owl:ObjectProperty ;
1173  ex:domain ex:Person ;
1174  ex:range ex:Person .
1175"#;
1176
1177        let chunks = chunk_rdf_text(Path::new("ontology.ttl"), source, &ChunkConfig::default());
1178
1179        assert_eq!(chunks.len(), 3);
1180        assert_eq!(chunks[0].kind, "rdf_statements");
1181        assert_eq!(chunks[0].name, "@prefix");
1182        assert_eq!(chunks[1].name, "ex:Person");
1183        assert_eq!(chunks[2].name, "ex:knows");
1184    }
1185
1186    #[test]
1187    fn header_dropped_when_exceeding_max_bytes() {
1188        // Create a chunk that barely fits in max_chunk_bytes, so adding
1189        // a header would push it over the limit.
1190        let tiny_config = ChunkConfig {
1191            max_chunk_bytes: 60,
1192            window_size: 30,
1193            window_overlap: 10,
1194        };
1195        // Source is exactly at max_chunk_bytes — any header would exceed it
1196        let source = "fn f() { let x = 42; return x; }";
1197        assert!(source.len() <= tiny_config.max_chunk_bytes);
1198
1199        let config = crate::languages::config_for_extension("rs").unwrap();
1200        let chunks = chunk_file(
1201            Path::new("long/path/to/file.rs"),
1202            source,
1203            &config,
1204            &tiny_config,
1205        );
1206        assert!(!chunks.is_empty());
1207        let chunk = &chunks[0];
1208        // Header ("// long/path/to/file.rs | defines: ...") + minified content > 60 bytes.
1209        // So enriched_content should fall back to minified content (no header),
1210        // and raw content is preserved as-is.
1211        assert!(
1212            !chunk.enriched_content.starts_with("//"),
1213            "header should be dropped when it would exceed max_chunk_bytes"
1214        );
1215        assert_eq!(chunk.content, source, "raw content should be unchanged");
1216    }
1217
1218    #[test]
1219    fn minify_whitespace_normalizes_indent_and_strips_trailing() {
1220        // 8-space indent → 4-space (halved)
1221        let source = "fn foo() {\n        let x = 1;\n        let y = 2;\n}\n";
1222        let result = minify_whitespace(source);
1223        let lines: Vec<&str> = result.lines().collect();
1224        assert_eq!(
1225            lines[1], "    let x = 1;",
1226            "8-space indent should become 4-space"
1227        );
1228        assert_eq!(
1229            lines[2], "    let y = 2;",
1230            "8-space indent should become 4-space"
1231        );
1232
1233        // Trailing whitespace removed
1234        let with_trailing = "fn bar()   \n    return 1;   \n";
1235        let result2 = minify_whitespace(with_trailing);
1236        assert!(
1237            result2.lines().all(|l| !l.ends_with(' ')),
1238            "trailing whitespace should be stripped"
1239        );
1240
1241        // 3+ consecutive blank lines collapsed to 1
1242        let with_blanks = "a\n\n\n\nb\n";
1243        let result3 = minify_whitespace(with_blanks);
1244        // Should have at most 1 blank line between a and b
1245        let blank_runs: Vec<usize> = {
1246            let mut runs = Vec::new();
1247            let mut count = 0usize;
1248            for line in result3.lines() {
1249                if line.is_empty() {
1250                    count += 1;
1251                } else {
1252                    if count > 0 {
1253                        runs.push(count);
1254                    }
1255                    count = 0;
1256                }
1257            }
1258            runs
1259        };
1260        assert!(
1261            blank_runs.iter().all(|&n| n <= 1),
1262            "3+ blank lines should collapse to 1, got runs: {blank_runs:?}"
1263        );
1264    }
1265
1266    // -------------------------------------------------------------------------
1267    // C1: ContentKind classification tests
1268    // -------------------------------------------------------------------------
1269
1270    /// `test:chunk_content_kind_code_for_rust_file` — chunks from a `.rs` file
1271    /// carry `ContentKind::Code`.
1272    ///
1273    /// Behavior: trigger-fails-on-baseline-then-passes-post-fix.
1274    /// On the baseline (before the `content_kind` field existed) this test did
1275    /// not compile. Post-fix it passes.
1276    #[test]
1277    fn chunk_content_kind_code_for_rust_file() {
1278        let source = "fn hello() {}\n";
1279        let config = crate::languages::config_for_extension("rs").unwrap();
1280        let chunks = chunk_file(
1281            Path::new("src/lib.rs"),
1282            source,
1283            &config,
1284            &ChunkConfig::default(),
1285        );
1286        assert!(!chunks.is_empty(), "expected at least one chunk");
1287        assert!(
1288            chunks.iter().all(|c| c.content_kind == ContentKind::Code),
1289            "all chunks from a .rs file must have ContentKind::Code; got: {:?}",
1290            chunks.iter().map(|c| c.content_kind).collect::<Vec<_>>()
1291        );
1292    }
1293
1294    /// `test:chunk_content_kind_meta_for_json_file` — chunks from a `.json` file
1295    /// carry `ContentKind::Meta`.
1296    #[test]
1297    fn chunk_content_kind_meta_for_json_file() {
1298        let source = r#"{"key": "value", "answer": 42}"#;
1299        let config = crate::languages::config_for_extension("json").unwrap();
1300        let chunks = chunk_file(
1301            Path::new("data.json"),
1302            source,
1303            &config,
1304            &ChunkConfig::default(),
1305        );
1306        assert!(!chunks.is_empty(), "expected at least one chunk");
1307        assert!(
1308            chunks.iter().all(|c| c.content_kind == ContentKind::Meta),
1309            "all chunks from a .json file must have ContentKind::Meta; got: {:?}",
1310            chunks.iter().map(|c| c.content_kind).collect::<Vec<_>>()
1311        );
1312    }
1313
1314    /// `test:chunk_content_kind_docs_for_md_file` — chunks from a `.md` file
1315    /// carry `ContentKind::Docs`.
1316    #[test]
1317    fn chunk_content_kind_docs_for_md_file() {
1318        let source = "# Title\n\nSome prose content.\n";
1319        let config = crate::languages::config_for_extension("md").unwrap();
1320        let chunks = chunk_file(
1321            Path::new("README.md"),
1322            source,
1323            &config,
1324            &ChunkConfig::default(),
1325        );
1326        assert!(!chunks.is_empty(), "expected at least one chunk");
1327        assert!(
1328            chunks.iter().all(|c| c.content_kind == ContentKind::Docs),
1329            "all chunks from a .md file must have ContentKind::Docs; got: {:?}",
1330            chunks.iter().map(|c| c.content_kind).collect::<Vec<_>>()
1331        );
1332    }
1333
1334    /// `test:chunk_content_kind_meta_for_yaml_toml_xml` — chunks from `.yaml`,
1335    /// `.toml`, and `.xml` files all carry `ContentKind::Meta`.
1336    #[test]
1337    fn chunk_content_kind_meta_for_yaml_toml_xml() {
1338        // YAML
1339        let yaml_source = "key: value\nanother: 42\n";
1340        let yaml_config = crate::languages::config_for_extension("yaml").unwrap();
1341        let yaml_chunks = chunk_file(
1342            Path::new("config.yaml"),
1343            yaml_source,
1344            &yaml_config,
1345            &ChunkConfig::default(),
1346        );
1347        assert!(!yaml_chunks.is_empty(), "expected yaml chunks");
1348        assert!(
1349            yaml_chunks
1350                .iter()
1351                .all(|c| c.content_kind == ContentKind::Meta),
1352            "yaml chunks must be Meta; got: {:?}",
1353            yaml_chunks
1354                .iter()
1355                .map(|c| c.content_kind)
1356                .collect::<Vec<_>>()
1357        );
1358
1359        // TOML
1360        let toml_source = "[section]\nkey = \"value\"\n";
1361        let toml_config = crate::languages::config_for_extension("toml").unwrap();
1362        let toml_chunks = chunk_file(
1363            Path::new("Cargo.toml"),
1364            toml_source,
1365            &toml_config,
1366            &ChunkConfig::default(),
1367        );
1368        assert!(!toml_chunks.is_empty(), "expected toml chunks");
1369        assert!(
1370            toml_chunks
1371                .iter()
1372                .all(|c| c.content_kind == ContentKind::Meta),
1373            "toml chunks must be Meta; got: {:?}",
1374            toml_chunks
1375                .iter()
1376                .map(|c| c.content_kind)
1377                .collect::<Vec<_>>()
1378        );
1379
1380        // XML
1381        let xml_source = r#"<?xml version="1.0"?><root><item>hello</item></root>"#;
1382        let xml_config = crate::languages::config_for_extension("xml").unwrap();
1383        let xml_chunks = chunk_file(
1384            Path::new("data.xml"),
1385            xml_source,
1386            &xml_config,
1387            &ChunkConfig::default(),
1388        );
1389        assert!(!xml_chunks.is_empty(), "expected xml chunks");
1390        assert!(
1391            xml_chunks
1392                .iter()
1393                .all(|c| c.content_kind == ContentKind::Meta),
1394            "xml chunks must be Meta; got: {:?}",
1395            xml_chunks
1396                .iter()
1397                .map(|c| c.content_kind)
1398                .collect::<Vec<_>>()
1399        );
1400    }
1401
1402    /// `test:content_kind_from_extension_covers_code_docs_meta` — the
1403    /// `ContentKind::from_extension` classifier returns the right variant for
1404    /// at least one representative extension from each category.
1405    #[test]
1406    fn content_kind_from_extension_covers_code_docs_meta() {
1407        // Code
1408        for ext in [
1409            "rs", "py", "ts", "go", "java", "cpp", "sh", "rb", "kt", "swift", "scala",
1410        ] {
1411            assert_eq!(
1412                ContentKind::from_extension(ext),
1413                ContentKind::Code,
1414                ".{ext} should be Code"
1415            );
1416        }
1417        // Docs
1418        for ext in ["md", "rst", "txt", "adoc", "org"] {
1419            assert_eq!(
1420                ContentKind::from_extension(ext),
1421                ContentKind::Docs,
1422                ".{ext} should be Docs"
1423            );
1424        }
1425        // Meta
1426        for ext in [
1427            "json", "yaml", "yml", "toml", "xml", "lock", "snap", "csv", "tsv", "proto",
1428        ] {
1429            assert_eq!(
1430                ContentKind::from_extension(ext),
1431                ContentKind::Meta,
1432                ".{ext} should be Meta"
1433            );
1434        }
1435    }
1436
1437    /// `test:sliding_window_chunks_carry_content_kind` — the fallback
1438    /// sliding-window path (text files) also propagates `content_kind`.
1439    #[test]
1440    fn sliding_window_chunks_carry_content_kind() {
1441        // .txt (Docs) fallback — chunk_source_for_path calls chunk_text which
1442        // calls sliding_windows.
1443        let source = "just some plain text with no tree-sitter grammar support\n";
1444        let chunks = chunk_source_for_path(
1445            Path::new("notes.txt"),
1446            source,
1447            false,
1448            &ChunkConfig::default(),
1449        );
1450        assert!(!chunks.is_empty(), "expected at least one chunk");
1451        assert!(
1452            chunks.iter().all(|c| c.content_kind == ContentKind::Docs),
1453            "notes.txt chunks must be Docs; got: {:?}",
1454            chunks.iter().map(|c| c.content_kind).collect::<Vec<_>>()
1455        );
1456    }
1457}