ripvec-core 4.1.0

//! Static encoder: in-process `StaticEmbedModel` reimplementation.
//!
//! Port of `~/src/semble/src/semble/index/dense.py`. Wraps
//! [`StaticEmbedModel`] loaded with `minishlab/potion-base-32M`
//! (256-dim, L2-normalized). Implements [`VectorEncoder`] for the
//! `--model ripvec` path. CPU-only. Full-corpus embedding streams chunks
//! to the encoder through small bounded channels (see `embed_root`).
//!
//! Default was bumped to `potion-base-32M` in v1.3.0 after the
//! gutenberg + python-repos matrix showed 32M winning prose by
//! 0.058 NDCG@10 while losing code by only 0.004 — a clear
//! single-default win once the i64 mapping bug and the reranker
//! pooler / sigmoid / truncation bugs were fixed. The code-tuned
//! `potion-code-16M` is still available via `--model-repo`.
//!
//! ## Why not `model2vec-rs`?
//!
//! The previous wave used the upstream `model2vec-rs` crate. Two real
//! problems pushed us to reimplement (see the
//! `crate::encoder::ripvec::static_model` module for the full design
//! rationale):
//!
//! 1. `model2vec_rs::StaticModel::encode_with_args` runs `pool_ids`
//!    in a serial inner loop while `tokenizers::encode_batch_fast`
//!    spawns its own rayon pool. Wrapping that path in our outer
//!    `par_chunks` produced 60% `__psynch_cvwait` in the linux-corpus
//!    profile — nested rayon scopes parking on each other. The
//!    reimplementation does ONE big tokenize plus a `par_iter` over
//!    `pool_ids` — no nested rayon, no parking.
//! 2. `model2vec-rs 0.2` pinned `ndarray 0.15`; ripvec-core uses
//!    `ndarray 0.17`. The two `Array2<f32>` types were not
//!    interchangeable, forcing a `Vec<Vec<f32>>` shim. Owning the
//!    load path eliminates the mismatch.

use std::path::{Path, PathBuf};
use std::sync::Mutex;

use crossbeam_channel::bounded;
use hf_hub::api::sync::Api;
use rayon::prelude::*;

use streaming_iterator::StreamingIterator;
use tree_sitter::{Parser, QueryCursor};

use crate::chunk::{CodeChunk, ContentKind};
use crate::embed::SearchConfig;
use crate::encoder::VectorEncoder;
use crate::encoder::ripvec::chunking::{DEFAULT_DESIRED_CHUNK_CHARS, chunk_source};
use crate::encoder::ripvec::static_model::StaticEmbedModel;
use crate::languages::{config_for_extension, lsp_symbol_kind_for_node_kind};
use crate::profile::Profiler;
use crate::walk::collect_files_with_options;

/// Encode batch size used by the streaming pipeline. Matches
/// `StaticEmbedModel`'s internal `BATCH_SIZE` so each emitted batch
/// is exactly one `encode_batch_fast` call's worth of work.
///
/// Also sizes the chunk channel in `embed_root`, whose capacity is
/// `PIPELINE_BATCH_SIZE * 8` individual chunks.
const PIPELINE_BATCH_SIZE: usize = 1024;

/// Number of full batches allowed in-flight from chunker to encoder.
/// Provides enough pipeline depth for the encoder to stay busy while
/// the chunker fills the next batch; small enough that peak memory
/// stays bounded.
const PIPELINE_RING_SIZE: usize = 4;

/// Default model repo identifier for the ripvec path, in HuggingFace
/// `org/repo` form.
///
/// A string like this, handed to [`StaticEncoder::from_pretrained`], is
/// loaded from disk when it names an existing local directory and is
/// otherwise downloaded through `hf-hub`. Whatever string was passed is
/// also what `identity()` reports.
pub const DEFAULT_MODEL_REPO: &str = "minishlab/potion-base-32M";

/// Default hidden dimension for [`DEFAULT_MODEL_REPO`].
///
/// `from_pretrained` does not read this constant; it takes the width from
/// the loaded model.
// NOTE(review): confirm this value still matches the embedding width
// published for `DEFAULT_MODEL_REPO` after the default-model bump.
pub const DEFAULT_HIDDEN_DIM: usize = 256;

/// Maximum source file size to read, in bytes (mirrors semble's
/// `_MAX_FILE_BYTES = 1_000_000` from `index/create.py:16`).
///
/// `chunk_one_file` skips files strictly larger than this; a file of
/// exactly this size is still read.
const MAX_FILE_BYTES: u64 = 1_000_000;

/// CPU-only static encoder.
///
/// Owns a loaded [`StaticEmbedModel`] plus identity metadata. The
/// embedder is constructed by `main.rs::load_pipeline` via
/// [`StaticEncoder::from_pretrained`], passing either a local directory
/// containing the Model2Vec files or a HuggingFace repo ID to download.
pub struct StaticEncoder {
    /// Loaded embedding model; query and batch encoding are delegated to it.
    model: StaticEmbedModel,
    /// The `model_repo` string exactly as given to `from_pretrained`
    /// (local path or HF repo ID); returned by `identity()`.
    model_repo: String,
    /// Embedding width reported by `model` at construction time.
    hidden_dim: usize,
}

impl StaticEncoder {
    /// Encode a query string into a single embedding row.
    ///
    /// Delegates directly to the loaded model's `encode_query`; the
    /// returned vector is whatever the model produces for `query`.
    ///
    /// Used by `RipvecIndex::search` for hybrid/semantic dispatch.
    #[must_use]
    pub fn encode_query(&self, query: &str) -> Vec<f32> {
        self.model.encode_query(query)
    }

    /// Build an encoder from a local model directory or a HuggingFace repo ID.
    ///
    /// `model_repo` is interpreted in one of two ways:
    ///
    /// 1. **Local path** — when it names an existing directory, the model
    ///    files are loaded straight from that directory. This is the route
    ///    used by the parity test fixture (`/tmp/potion-base-32M`) and by
    ///    anyone who pre-stages the files.
    /// 2. **HuggingFace repo ID** — anything else is treated as `org/repo`;
    ///    `config.json`, `tokenizer.json` and `model.safetensors` are
    ///    fetched through `hf-hub` into its cache and loaded from there.
    ///    This matches `load_classic_cpu` / `load_modernbert_cpu`, so a bare
    ///    `--model ripvec` without `--model-repo` works.
    ///
    /// The string passed in is stored verbatim and later reported by
    /// `identity()`; the hidden dimension is read from the loaded model.
    ///
    /// # Errors
    ///
    /// Propagates the download error from directory resolution, or an
    /// `Error::Other` wrapping the load failure when the files cannot be
    /// parsed or the safetensors layout is unrecognized.
    pub fn from_pretrained(model_repo: &str) -> crate::Result<Self> {
        let model_dir = Self::resolve_model_dir(model_repo)?;
        // NOTE(review): `Some(true)` presumably turns on L2 normalization —
        // confirm against `StaticEmbedModel::from_path`.
        let loaded = StaticEmbedModel::from_path(&model_dir, Some(true));
        let model = loaded
            .map_err(|e| crate::Error::Other(anyhow::anyhow!("static model load failed: {e}")))?;
        Ok(Self {
            // Read the width before `model` is moved into the struct.
            hidden_dim: model.hidden_dim(),
            model,
            model_repo: model_repo.to_string(),
        })
    }

    /// Resolve `model_repo` to a directory containing the model files.
    ///
    /// If `model_repo` is an existing local directory, returns it as-is.
    /// Otherwise downloads via `hf-hub` and returns the cache directory.
    fn resolve_model_dir(model_repo: &str) -> crate::Result<PathBuf> {
        let local = Path::new(model_repo);
        if local.is_dir() {
            return Ok(local.to_path_buf());
        }

        // HuggingFace repo path. Download the three required files and
        // return the directory `hf-hub` cached them into. All files
        // land in the same snapshot directory.
        let api = Api::new().map_err(|e| crate::Error::Download(e.to_string()))?;
        let repo = api.model(model_repo.to_string());
        let _ = repo
            .get("config.json")
            .map_err(|e| crate::Error::Download(e.to_string()))?;
        let _ = repo
            .get("tokenizer.json")
            .map_err(|e| crate::Error::Download(e.to_string()))?;
        let weights_path = repo
            .get("model.safetensors")
            .map_err(|e| crate::Error::Download(e.to_string()))?;
        // hf-hub returns the file path; the snapshot directory is its parent.
        weights_path
            .parent()
            .map(std::path::Path::to_path_buf)
            .ok_or_else(|| {
                crate::Error::Other(anyhow::anyhow!(
                    "hf-hub returned root path for {model_repo}; cannot resolve snapshot dir"
                ))
            })
    }

    /// Chunk and embed a caller-supplied list of files without walking the tree.
    ///
    /// Used by [`RipvecIndex::apply_diff`](crate::encoder::ripvec::index::RipvecIndex::apply_diff)
    /// to re-embed only the files that changed since the last reconcile.
    /// `root` is the corpus root; each chunk's `file_path` is the file's
    /// path with `root` stripped, matching what
    /// [`VectorEncoder::embed_root`] records for unchanged files.
    ///
    /// Returns `(chunks, embeddings)` as two flat, index-aligned lists in the
    /// traversal order of `paths`. Files that cannot be read or that yield
    /// no chunks contribute nothing (same policy as [`chunk_one_file`]). When
    /// no chunks are produced at all, both lists are empty and the model is
    /// not invoked.
    ///
    /// # Why a separate method
    ///
    /// [`VectorEncoder::embed_root`] is a heavy three-stage pipeline tuned
    /// for full-corpus builds (thousands of files). For the "1-50 files
    /// changed" case that drives reconciliation, this sequential path is
    /// simpler: no dedicated rayon pool, no bounded channels, no inter-stage
    /// hand-off. All chunk texts go to the model in a single
    /// `encode_batch` call.
    ///
    /// # Errors
    ///
    /// The current body has no fallible step — `encode_batch` hands back its
    /// rows directly — so every call returns `Ok`. The `Result` is part of
    /// the public signature.
    pub fn embed_paths(
        &self,
        root: &Path,
        paths: &[std::path::PathBuf],
        profiler: &Profiler,
    ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
        let _guard = profiler.phase("embed_paths");

        // Gather chunks and their texts file by file, preserving `paths` order.
        let mut all_chunks: Vec<CodeChunk> = Vec::new();
        let mut all_texts: Vec<String> = Vec::new();
        for (file_chunks, file_texts) in paths.iter().map(|path| chunk_one_file(root, path)) {
            all_chunks.extend(file_chunks);
            all_texts.extend(file_texts);
        }

        // Nothing to embed: skip the model call entirely.
        if all_chunks.is_empty() {
            return Ok((Vec::new(), Vec::new()));
        }

        let borrowed: Vec<&str> = all_texts.iter().map(String::as_str).collect();
        let embeddings = self.model.encode_batch(&borrowed);
        debug_assert_eq!(embeddings.len(), all_chunks.len());
        Ok((all_chunks, embeddings))
    }
}

impl VectorEncoder for StaticEncoder {
    /// Walk `root`, chunk every collected file and embed the chunks through a
    /// three-stage bounded-queue pipeline:
    ///
    /// 1. **Chunk producer** — rayon `par_iter` over the file list. Each
    ///    file is read, parsed by tree-sitter (or line-merged on
    ///    fallback), and emitted as `(CodeChunk, String)` pairs into a
    ///    bounded channel of capacity `PIPELINE_BATCH_SIZE * 8`.
    /// 2. **Batch accumulator** — a single scoped thread drains the
    ///    chunk channel, packs `PIPELINE_BATCH_SIZE` pairs per batch,
    ///    and forwards into a bounded channel of capacity
    ///    `PIPELINE_RING_SIZE`.
    /// 3. **Encode worker** — a single scoped thread receives batches
    ///    and calls `StaticEmbedModel::encode_batch`, whose internal
    ///    `par_iter` lights up rayon for the pool_ids kernel.
    ///
    /// Why this shape:
    ///
    /// - The previous "chunk all, then embed all" implementation held
    ///   the entire `Vec<String>` of chunk contents in memory between
    ///   phases. On the linux corpus that was ~400 MB peak. The
    ///   bounded queues cap in-flight memory at
    ///   `PIPELINE_BATCH_SIZE * 8 + PIPELINE_RING_SIZE * PIPELINE_BATCH_SIZE`
    ///   chunks regardless of corpus size — under 15 MB.
    /// - The chunk phase (13s on linux) is hidden inside the embed
    ///   phase (70s) instead of serializing before it. Pre-pipeline
    ///   profile showed user-time at 394s on 82s wall = 4.8x
    ///   parallelism on 12 cores; pipeline lets idle cores chew on
    ///   chunking while embed runs.
    /// - Mirrors `embed::embed_all_streaming`'s shape so the two
    ///   pipelines (BERT + semble) share architectural conventions.
    ///
    /// Returns `(chunks, embeddings)` where `chunks[i]` pairs with
    /// `embeddings[i]`. Because Stage 1 sends from parallel workers, the
    /// order of chunks across files is not fixed. An empty walk returns two
    /// empty lists without starting the pipeline.
    ///
    /// # Errors
    ///
    /// Returns `Error::Other` if the dedicated chunking thread pool cannot
    /// be built.
    ///
    /// # Panics
    ///
    /// Panics if the output mutex is poisoned. A panic inside any pipeline
    /// thread is re-raised when `std::thread::scope` joins its threads.
    fn embed_root(
        &self,
        root: &Path,
        cfg: &SearchConfig,
        profiler: &Profiler,
    ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
        // Phase 1: walk (still serial-to-pipeline because we need the
        // full file list to par_iter over; the walk itself is rayon).
        let walk_options = cfg.walk_options();
        let file_paths = {
            let _guard = profiler.phase("walk");
            collect_files_with_options(root, &walk_options)
        };
        if file_paths.is_empty() {
            return Ok((Vec::new(), Vec::new()));
        }

        // Bounded channels. See module constants for the rationale on
        // PIPELINE_BATCH_SIZE and PIPELINE_RING_SIZE.
        let (chunk_tx, chunk_rx) = bounded::<(CodeChunk, String)>(PIPELINE_BATCH_SIZE * 8);
        let (batch_tx, batch_rx) = bounded::<Vec<(CodeChunk, String)>>(PIPELINE_RING_SIZE);

        // The encoder stage appends its output behind a Mutex. Order
        // across files isn't meaningful (RipvecIndex doesn't rely on
        // chunk order), only the chunk[i] <-> embedding[i] pairing
        // matters — which we preserve trivially by pushing in lockstep.
        let output: Mutex<Vec<(CodeChunk, Vec<f32>)>> = Mutex::new(Vec::new());
        let model = &self.model;

        // Stage 1 runs on a DEDICATED rayon thread pool. If we used
        // the global pool, Stage 1's par_iter workers would park on
        // full `chunk_tx.send()` calls, and Stage 3's
        // `encode_batch` → `pool_ids` par_iter would have no rayon
        // workers available (they're all parked). That's a classic
        // nested-rayon deadlock — observed in profiling as PID stuck
        // at 0% CPU with 16 parked threads.
        //
        // Half the cores for chunking, half remain in the global pool
        // for the encode worker's pool_ids. The chunk phase (tree-
        // sitter + I/O bound) doesn't need full parallelism to
        // pipeline cleanly behind embed.
        let num_cores = rayon::current_num_threads().max(2);
        let chunk_threads = (num_cores / 2).max(1);
        let chunk_pool = rayon::ThreadPoolBuilder::new()
            .num_threads(chunk_threads)
            .thread_name(|i| format!("semble-chunk-{i}"))
            .build()
            .map_err(|e| crate::Error::Other(anyhow::anyhow!("chunk thread pool build: {e}")))?;

        // The guard lives until the function returns, so the "pipeline"
        // phase also covers splitting the collected pairs below.
        let _phase_guard = profiler.phase("pipeline");
        std::thread::scope(|scope| {
            // Stage 1: chunk producer on the dedicated pool. The sender is
            // moved into the thread so the channel closes when it finishes.
            let chunk_tx_owned = chunk_tx;
            scope.spawn(move || {
                chunk_pool.install(|| {
                    file_paths.par_iter().for_each(|full| {
                        let (chunks, contents) = chunk_one_file(root, full);
                        for (chunk, content) in chunks.into_iter().zip(contents) {
                            // A send error means the receiver is gone;
                            // abandon the rest of this file.
                            if chunk_tx_owned.send((chunk, content)).is_err() {
                                return;
                            }
                        }
                    });
                });
                // chunk_tx_owned drops here, closing the channel.
            });

            // Stage 2: batch accumulator.
            let batch_tx_owned = batch_tx;
            scope.spawn(move || {
                let mut buf: Vec<(CodeChunk, String)> = Vec::with_capacity(PIPELINE_BATCH_SIZE);
                for pair in chunk_rx {
                    buf.push(pair);
                    if buf.len() >= PIPELINE_BATCH_SIZE {
                        // Hand off the full batch and start a fresh buffer.
                        let batch =
                            std::mem::replace(&mut buf, Vec::with_capacity(PIPELINE_BATCH_SIZE));
                        if batch_tx_owned.send(batch).is_err() {
                            return;
                        }
                    }
                }
                // Flush the final, partially filled batch.
                if !buf.is_empty() {
                    let _ = batch_tx_owned.send(buf);
                }
                // batch_tx_owned drops here, closing the channel.
            });

            // Stage 3: encode worker.
            scope.spawn(|| {
                for batch in batch_rx {
                    if batch.is_empty() {
                        continue;
                    }
                    // Split the pairs so the texts can be borrowed as `&str`
                    // for the model while the chunks stay owned.
                    let mut chunks = Vec::with_capacity(batch.len());
                    let mut texts: Vec<String> = Vec::with_capacity(batch.len());
                    for (chunk, text) in batch {
                        chunks.push(chunk);
                        texts.push(text);
                    }
                    let text_refs: Vec<&str> = texts.iter().map(String::as_str).collect();
                    let embeddings = model.encode_batch(&text_refs);
                    debug_assert_eq!(embeddings.len(), chunks.len());
                    // Lock only after encoding, for the duration of the append.
                    let mut out = output.lock().expect("output mutex poisoned");
                    for (chunk, emb) in chunks.into_iter().zip(embeddings) {
                        out.push((chunk, emb));
                    }
                }
            });
        });

        // All scoped threads have been joined; split the pairs into the two
        // index-aligned output lists.
        let collected = output.into_inner().expect("output mutex poisoned");
        let mut chunks_out = Vec::with_capacity(collected.len());
        let mut embs_out = Vec::with_capacity(collected.len());
        for (chunk, emb) in collected {
            chunks_out.push(chunk);
            embs_out.push(emb);
        }
        Ok((chunks_out, embs_out))
    }

    /// Embedding width, as reported by the loaded model when this encoder
    /// was constructed.
    fn hidden_dim(&self) -> usize {
        self.hidden_dim
    }

    /// The `model_repo` string this encoder was created with (a local path
    /// or a HuggingFace repo ID), returned unchanged.
    fn identity(&self) -> &str {
        &self.model_repo
    }
}

/// A resolved symbol capture: name text, its byte span, and the LSP SymbolKind
/// of its enclosing definition node.
///
/// Produced by [`extract_name_captures`] from a single query match that has a
/// `@name` capture. When the same match also has a `@def` capture, the kind is
/// derived from it; otherwise the kind falls back to `VARIABLE`.
struct NameCapture {
    /// Byte offset of the `@name` node's start within the source.
    start_byte: usize,
    /// Byte offset one past the `@name` node's end.
    end_byte: usize,
    /// Identifier text extracted from the `@name` capture.
    name: String,
    /// LSP SymbolKind derived from the `@def` node's tree-sitter node kind,
    /// or `VARIABLE` when the match had no `@def` capture.
    lsp_kind: u32,
}

/// Extract symbol names (and their definition kinds) from a tree-sitter parse
/// of `source`, using the language config's compiled query.
///
/// Every query match that carries a `@name` capture yields one
/// [`NameCapture`]. If the match also carries a `@def` capture, the LSP kind
/// is mapped from that node's kind; otherwise it falls back to `VARIABLE`
/// (the pre-B1 default). When a match has several `@name` captures, the last
/// usable one wins.
///
/// The list is sorted by `start_byte` so callers can locate the captures for
/// a chunk by position. The sort is stable, so captures that start at the
/// same byte keep the order in which the query produced them.
///
/// Returns an empty list when the language cannot be set on the parser or
/// the parse fails.
///
/// Performs exactly one parse and one query execution per `chunk_one_file`
/// call — O(1) parses regardless of the number of chunks.
fn extract_name_captures(
    source: &str,
    lang_cfg: &crate::languages::LangConfig,
) -> Vec<NameCapture> {
    let mut parser = Parser::new();
    if parser.set_language(&lang_cfg.language).is_err() {
        return Vec::new();
    }
    let Some(tree) = parser.parse(source, None) else {
        return Vec::new();
    };
    let mut cursor = QueryCursor::new();
    let mut matches = cursor.matches(&lang_cfg.query, tree.root_node(), source.as_bytes());
    let capture_names = lang_cfg.query.capture_names();
    let mut result: Vec<NameCapture> = Vec::new();
    // `matches` is a streaming iterator, so it has to be driven with
    // `while let` rather than a `for` loop.
    while let Some(m) = matches.next() {
        // `(start, end, text)` of the @name capture, and the node kind of
        // the @def capture, if this match has them.
        let mut name: Option<(usize, usize, &str)> = None;
        let mut def_kind: Option<&str> = None;

        for cap in m.captures {
            let cap_name = &capture_names[cap.index as usize];
            if *cap_name == "name" {
                let start = cap.node.start_byte();
                let end = cap.node.end_byte();
                // Checked slicing: an out-of-range, reversed, or
                // non-char-boundary span is skipped instead of panicking.
                if let Some(text) = source.get(start..end) {
                    name = Some((start, end, text));
                }
            } else if *cap_name == "def" {
                def_kind = Some(cap.node.kind());
            }
        }

        let Some((start_byte, end_byte, text)) = name else {
            continue;
        };
        // If there's no @def capture, fall back to Variable (pre-B1 default).
        let lsp_kind = match def_kind {
            Some(kind) => lsp_symbol_kind_for_node_kind(kind),
            None => crate::languages::lsp_symbol_kind::VARIABLE,
        };
        result.push(NameCapture {
            start_byte,
            end_byte,
            name: text.to_string(),
            lsp_kind,
        });
    }
    // Sort by byte position so lookups per chunk can work by position.
    // Stable, so ties keep query match order.
    result.sort_by_key(|c| c.start_byte);
    result
}

/// Find the best name and LSP SymbolKind for a chunk covering
/// `[chunk_start, chunk_end)` bytes.
///
/// "Best" = the first [`NameCapture`], in slice order, whose whole span lies
/// inside the chunk (`start_byte >= chunk_start` and `end_byte <= chunk_end`).
/// A capture that straddles the chunk end is skipped. Returns
/// `("", VARIABLE)` if none is found (graceful fallback preserving the pre-B1
/// default kind).
///
/// `captures` must be sorted by `start_byte`, as produced by
/// `extract_name_captures`. The sort order is used twice: a binary search
/// skips every capture that starts before the chunk, and the scan stops at
/// the first capture that starts at or after `chunk_end`.
fn name_for_chunk(captures: &[NameCapture], chunk_start: usize, chunk_end: usize) -> (&str, u32) {
    // Index of the first capture starting at or after `chunk_start`; nothing
    // before it can satisfy the containment test.
    let first = captures.partition_point(|cap| cap.start_byte < chunk_start);
    for cap in &captures[first..] {
        // `start_byte >= chunk_start` already holds here, so only the end
        // needs checking.
        if cap.end_byte <= chunk_end {
            return (cap.name.as_str(), cap.lsp_kind);
        }
        // Since captures are sorted by start byte, once we pass chunk_end
        // there can be no more candidates.
        if cap.start_byte >= chunk_end {
            break;
        }
    }
    ("", crate::languages::lsp_symbol_kind::VARIABLE)
}

/// Read one file and split it into chunks.
///
/// Returns `(file_chunks, file_contents)`, two index-aligned lists. Both are
/// empty when the file's metadata cannot be read, the file is larger than
/// `MAX_FILE_BYTES`, it cannot be read as a UTF-8 string, or chunking yields
/// nothing but whitespace.
///
/// Each chunk's `file_path` is `full` with the `root` prefix stripped (or
/// `full` itself when it is not under `root`). `name` and `kind` come from
/// the first tree-sitter name capture contained in the chunk; both stay
/// empty when there is none.
fn chunk_one_file(root: &Path, full: &Path) -> (Vec<CodeChunk>, Vec<String>) {
    // Unreadable metadata and oversized files are both skipped.
    let size_ok = matches!(std::fs::metadata(full), Ok(meta) if meta.len() <= MAX_FILE_BYTES);
    if !size_ok {
        return (Vec::new(), Vec::new());
    }
    let Ok(source) = std::fs::read_to_string(full) else {
        return (Vec::new(), Vec::new());
    };

    let ext = full
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or_default();
    let content_kind = ContentKind::from_extension(ext);
    let rel_path = full
        .strip_prefix(root)
        .unwrap_or(full)
        .display()
        .to_string();

    let lang_cfg = config_for_extension(ext);
    let language = lang_cfg.as_ref().map(|c| &c.language);

    // One parse per file collects every `@name` + `@def` capture used to
    // fill in chunk names and kinds. With no language config (or a failed
    // parse) the list is empty, so names stay "" and kinds stay empty.
    let name_captures: Vec<NameCapture> = match lang_cfg.as_deref() {
        Some(cfg) => extract_name_captures(&source, cfg),
        None => Vec::new(),
    };

    let boundaries = chunk_source(&source, language, DEFAULT_DESIRED_CHUNK_CHARS);
    let mut chunks = Vec::with_capacity(boundaries.len());
    let mut contents = Vec::with_capacity(boundaries.len());
    for b in boundaries {
        let text = b.content(&source);
        // Whitespace-only chunks carry nothing worth embedding.
        if text.trim().is_empty() {
            continue;
        }
        let (symbol, lsp_kind) = name_for_chunk(&name_captures, b.start_byte, b.end_byte);
        // The LSP SymbolKind is stored as a decimal string so downstream
        // consumers (e.g., ripvec-mcp's lsp_workspace_symbols) can parse it
        // without re-running the mapping table. Chunks without a recognised
        // definition keep an empty kind (consistent with pre-B2 behaviour).
        let kind = match symbol {
            "" => String::new(),
            _ => lsp_kind.to_string(),
        };
        contents.push(text.to_string());
        chunks.push(CodeChunk {
            file_path: rel_path.clone(),
            name: symbol.to_string(),
            kind,
            content_kind,
            start_line: b.start_line,
            end_line: b.end_line,
            // Dense/AST-merge path does not track the identifier line separately;
            // fall back to start_line per CodeChunk.symbol_line documentation.
            symbol_line: b.start_line,
            content: text.to_string(),
            enriched_content: text.to_string(),
            qualified_name: None,
        });
    }
    (chunks, contents)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::encoder::VectorEncoder;
    use std::io::Write as _;

    /// `test:chunk_one_file_populates_name_from_tree_sitter` — for a Rust
    /// file containing a single function, `chunk_one_file` must produce at
    /// least one chunk and at least one chunk named after that function.
    #[test]
    fn chunk_one_file_populates_name_from_tree_sitter() {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = dir.path().join("add.rs");
        let source = "pub fn add(a: i32, b: i32) -> i32 { a + b }\n";
        // The temporary `File` is dropped (and closed) at the end of this
        // statement, before the chunker reads the path.
        std::fs::File::create(&path)
            .expect("create")
            .write_all(source.as_bytes())
            .expect("write");

        let (chunks, _) = chunk_one_file(dir.path(), &path);
        assert!(
            !chunks.is_empty(),
            "expected at least one chunk from Rust source"
        );
        let found_add = chunks.iter().any(|c| c.name == "add");
        assert!(
            found_add,
            "expected at least one chunk with name 'add'; got names: {:?}",
            chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
        );
    }

    /// `test:chunk_one_file_leaves_name_empty_when_no_identifier` — a source
    /// made only of comments and whitespace has no definitions, so every
    /// chunk that is produced (possibly none) must have an empty name.
    #[test]
    fn chunk_one_file_leaves_name_empty_when_no_identifier() {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = dir.path().join("comments.rs");
        // No function/struct/enum definitions anywhere in this source.
        let source = "// just a comment\n   \n// another comment\n";
        std::fs::File::create(&path)
            .expect("create")
            .write_all(source.as_bytes())
            .expect("write");

        let (chunks, _) = chunk_one_file(dir.path(), &path);
        // Vacuously true when no chunks come back at all.
        chunks.iter().for_each(|c| {
            assert!(
                c.name.is_empty(),
                "expected empty name for comment-only source; got {:?}",
                c.name
            );
        });
    }

    /// Compile-time check that `StaticEncoder` is a `VectorEncoder` and is
    /// both `Send` and `Sync` (`test:static-encoder-implements-vector-encoder`).
    /// The test body does nothing at runtime; it only has to type-check.
    #[test]
    fn static_encoder_implements_vector_encoder() {
        fn requires_shareable_encoder<E>()
        where
            E: VectorEncoder + Send + Sync,
        {
        }
        requires_shareable_encoder::<StaticEncoder>();
    }

    // -------------------------------------------------------------------------
    // B2: chunk_one_file kind-tagging tests
    // -------------------------------------------------------------------------

    /// Helper: create a fresh temp directory, write `source` into `filename`
    /// inside it, and return `(dir, path)`. The caller must keep `dir` alive
    /// for as long as the file is needed.
    fn write_temp(source: &str, filename: &str) -> (tempfile::TempDir, std::path::PathBuf) {
        let tmp = tempfile::tempdir().expect("tempdir");
        let file_path = tmp.path().join(filename);
        std::fs::write(&file_path, source).expect("write");
        (tmp, file_path)
    }

    /// `test:chunk_one_file_populates_kind_for_rust_struct` — for a
    /// `pub struct`, `chunk_one_file` must emit a chunk named after the
    /// struct whose `kind` is `"23"` (LSP Struct).
    ///
    /// Behavior: trigger-fails-on-baseline-then-passes-post-fix.
    /// On the baseline, `kind` was always `""` (empty string from the semble
    /// chunker), so this test fails. Post-B2 the kind is the LSP numeric string.
    #[test]
    fn chunk_one_file_populates_kind_for_rust_struct() {
        let (dir, path) = write_temp("pub struct Foo { x: i32 }\n", "foo.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);

        let struct_chunk = chunks
            .iter()
            .find(|c| c.name == "Foo")
            .unwrap_or_else(|| {
                panic!(
                    "expected a chunk named 'Foo'; got: {:?}",
                    chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
                )
            });
        let kind = &struct_chunk.kind;
        assert_eq!(
            kind.as_str(),
            "23",
            "struct_item must emit LSP SymbolKind::Struct (23); got: {kind:?}"
        );
    }

    /// `test:chunk_one_file_populates_kind_for_rust_trait` — for a trait
    /// definition, `chunk_one_file` must emit a chunk named after the trait
    /// whose `kind` is `"11"` (LSP Interface).
    #[test]
    fn chunk_one_file_populates_kind_for_rust_trait() {
        let (dir, path) = write_temp("pub trait MyTrait { fn method(&self); }\n", "trait.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);

        let trait_chunk = chunks
            .iter()
            .find(|c| c.name == "MyTrait")
            .unwrap_or_else(|| {
                panic!(
                    "expected a chunk named 'MyTrait'; got: {:?}",
                    chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
                )
            });
        let kind = &trait_chunk.kind;
        assert_eq!(
            kind.as_str(),
            "11",
            "trait_item must emit LSP SymbolKind::Interface (11); got: {kind:?}"
        );
    }

    /// `test:chunk_one_file_kind_distinct_from_variable_default` — after B2,
    /// named chunks must not carry the old hardcoded `""` (empty) kind.
    ///
    /// Pre-B2 all chunks from the semble AST-merge path had `kind: String::new()`
    /// (= `""`). This test ensures that chunks whose name is non-empty carry a
    /// meaningful, non-empty LSP kind string.
    ///
    /// Note: The semble AST-merge chunker packs adjacent small definitions into a
    /// single chunk and assigns only the FIRST capture's name. The kind test
    /// therefore validates the overall invariant — named chunks have non-empty
    /// kinds — rather than testing each definition independently (which requires
    /// definitions large enough to occupy distinct chunks).
    ///
    /// The `Qux` chunk is required to exist: if it were optional, the
    /// Struct-kind assertion would be skipped silently whenever naming
    /// regressed.
    #[test]
    fn chunk_one_file_kind_distinct_from_variable_default() {
        // Use a file with a single, definitively-named struct so the chunk
        // carries a meaningful kind. The semble chunker is expected to emit
        // one chunk with name "Qux" and kind "23" (Struct).
        let source = "pub struct Qux { x: i32, y: i32 }\n";
        let (dir, path) = write_temp(source, "qux.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);

        // Find the named chunks.
        let named_chunks: Vec<_> = chunks.iter().filter(|c| !c.name.is_empty()).collect();
        assert!(
            !named_chunks.is_empty(),
            "expected at least one named chunk from Rust source with struct definition"
        );

        // Every named chunk must have a non-empty kind (pre-B2 regression: kind was "").
        for c in &named_chunks {
            assert!(
                !c.kind.is_empty(),
                "named chunk '{}' must have non-empty kind (pre-B2 regression); got empty",
                c.name
            );
        }

        // The struct chunk must exist and must have kind "23" (LSP Struct).
        let qux = named_chunks
            .iter()
            .find(|c| c.name == "Qux")
            .unwrap_or_else(|| {
                panic!(
                    "expected a chunk named 'Qux'; got: {:?}",
                    chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
                )
            });
        assert_eq!(
            qux.kind.as_str(),
            "23",
            "Qux (struct_item) must emit LSP SymbolKind::Struct (23); got: {:?}",
            qux.kind
        );
    }

    /// Loads a model via `from_pretrained` from the location named by the
    /// `RIPVEC_SEMBLE_MODEL_PATH` environment variable and checks three
    /// things: the reported hidden dimension equals `DEFAULT_HIDDEN_DIM`,
    /// `identity()` echoes the string that was passed in, and a query
    /// embedding has unit L2 norm (within `1e-3`).
    ///
    /// Ignored by default because it needs model files to be available; when
    /// the variable is unset the test prints a note and returns early.
    ///
    /// Corresponds to acceptance `test:static-encoder-hidden-dim-256` and
    /// `test:static-encoder-loads-potion-code-16m` and
    /// `test:static-encoder-output-is-l2-normalized`.
    #[test]
    #[ignore = "requires local model files at RIPVEC_SEMBLE_MODEL_PATH"]
    fn static_encoder_loads_potion_code_16m() {
        let Ok(model_path) = std::env::var("RIPVEC_SEMBLE_MODEL_PATH") else {
            eprintln!("RIPVEC_SEMBLE_MODEL_PATH not set; skipping");
            return;
        };
        let encoder =
            StaticEncoder::from_pretrained(&model_path).expect("model load should succeed");
        assert_eq!(encoder.hidden_dim(), DEFAULT_HIDDEN_DIM);
        // identity() reflects what the caller passed (typically the
        // local path under test).
        assert_eq!(encoder.identity(), model_path);

        // Verify L2-normalized output via the public encode_query path.
        let embedding = encoder.encode_query("hello world");
        let squared_sum = embedding.iter().map(|x| x * x).sum::<f32>();
        let norm: f32 = squared_sum.sqrt();
        assert!(
            (norm - 1.0).abs() < 1e-3,
            "expected L2-normalized output; got norm={norm}"
        );
    }
}