ripvec-core 4.1.0

Semantic code + document search engine. Cacheless static-embedding + cross-encoder rerank by default; optional ModernBERT/BGE transformer engines with GPU backends. Tree-sitter chunking, hybrid BM25 + PageRank, composable ranking layers.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
//! Static encoder: in-process `StaticEmbedModel` reimplementation.
//!
//! Port of `~/src/semble/src/semble/index/dense.py`. Wraps
//! [`StaticEmbedModel`] loaded with `minishlab/potion-base-32M`
//! (256-dim, L2-normalized). Implements [`VectorEncoder`] for the
//! `--model ripvec` path. CPU-only; no batching ring buffer.
//!
//! Default was bumped to `potion-base-32M` in v1.3.0 after the
//! gutenberg + python-repos matrix showed 32M winning prose by
//! 0.058 NDCG@10 while losing code by only 0.004 — a clear
//! single-default win once the i64 mapping bug and the reranker
//! pooler / sigmoid / truncation bugs were fixed. The code-tuned
//! `potion-code-16M` is still available via `--model-repo`.
//!
//! ## Why not `model2vec-rs`?
//!
//! The previous wave used the upstream `model2vec-rs` crate. Two real
//! problems pushed us to reimplement (see
//! `crates/ripvec-core/src/encoder/ripvec/static_model.rs` for the
//! full design rationale):
//!
//! 1. `model2vec_rs::StaticModel::encode_with_args` runs `pool_ids`
//!    in a serial inner loop while `tokenizers::encode_batch_fast`
//!    spawns its own rayon pool. Wrapping that path in our outer
//!    `par_chunks` produced 60% `__psynch_cvwait` in the linux-corpus
//!    profile — nested rayon scopes parking on each other. The
//!    reimplementation does ONE big tokenize plus a `par_iter` over
//!    `pool_ids` — no nested rayon, no parking.
//! 2. `model2vec-rs 0.2` pinned `ndarray 0.15`; ripvec-core uses
//!    `ndarray 0.17`. The two `Array2<f32>` types were not
//!    interchangeable, forcing a `Vec<Vec<f32>>` shim. Owning the
//!    load path eliminates the mismatch.

use std::path::{Path, PathBuf};
use std::sync::Mutex;

use crossbeam_channel::bounded;
use hf_hub::api::sync::Api;
use rayon::prelude::*;

use streaming_iterator::StreamingIterator;
use tree_sitter::{Parser, QueryCursor};

use crate::chunk::{CodeChunk, ContentKind};
use crate::embed::SearchConfig;
use crate::encoder::VectorEncoder;
use crate::encoder::ripvec::chunking::{DEFAULT_DESIRED_CHUNK_CHARS, chunk_source};
use crate::encoder::ripvec::static_model::StaticEmbedModel;
use crate::languages::{config_for_extension, lsp_symbol_kind_for_node_kind};
use crate::profile::Profiler;
use crate::walk::collect_files_with_options;

/// Encode batch size used by the streaming pipeline. Matches
/// `StaticEmbedModel`'s internal `BATCH_SIZE` so each emitted batch
/// is exactly one `encode_batch_fast` call's worth of work.
const PIPELINE_BATCH_SIZE: usize = 1024;

/// Number of full batches allowed in-flight from chunker to encoder.
/// Provides enough pipeline depth for the encoder to stay busy while
/// the chunker fills the next batch; small enough that peak memory
/// stays bounded.
const PIPELINE_RING_SIZE: usize = 4;

/// Default HuggingFace repo ID for the ripvec path.
///
/// [`StaticEncoder::from_pretrained`] accepts either a repo ID such as
/// this one (fetched through `hf-hub`) or an existing local directory,
/// and keeps whichever string it was given as the encoder's
/// `identity()`.
pub const DEFAULT_MODEL_REPO: &str = "minishlab/potion-base-32M";

/// Default hidden dimension for [`DEFAULT_MODEL_REPO`].
pub const DEFAULT_HIDDEN_DIM: usize = 256;

/// Maximum source file size to read, in bytes (mirrors semble's
/// `_MAX_FILE_BYTES = 1_000_000` from `index/create.py:16`).
const MAX_FILE_BYTES: u64 = 1_000_000;

/// CPU-only static encoder.
///
/// Owns a loaded [`StaticEmbedModel`] plus identity metadata. The
/// embedder is constructed by `main.rs::load_pipeline` via
/// [`StaticEncoder::from_pretrained`], passing either a local
/// directory containing the Model2Vec files or a HuggingFace repo ID.
pub struct StaticEncoder {
    /// Loaded static embedding model that performs the actual encoding.
    model: StaticEmbedModel,
    /// The string handed to `from_pretrained`; returned by `identity()`.
    model_repo: String,
    /// Embedding width reported by the model when it was loaded.
    hidden_dim: usize,
}

impl StaticEncoder {
    /// Encode a query string into a single embedding row.
    ///
    /// Used by `RipvecIndex::search` for hybrid/semantic dispatch.
    #[must_use]
    pub fn encode_query(&self, query: &str) -> Vec<f32> {
        self.model.encode_query(query)
    }

    /// Load a model by HuggingFace repo ID or local path.
    ///
    /// Two acceptance shapes:
    ///
    /// 1. **Local path** — if `model_repo` names an existing directory,
    ///    load directly from it. Used by the parity test fixture path
    ///    (`/tmp/potion-base-32M`) and any user pre-staging files.
    /// 2. **HuggingFace repo ID** — otherwise treat as `org/repo`,
    ///    download `config.json` / `tokenizer.json` / `model.safetensors`
    ///    via `hf-hub` into `~/.cache/huggingface/hub/`, and load from
    ///    there. Matches `load_classic_cpu` / `load_modernbert_cpu`'s
    ///    behaviour so the user-facing API is consistent: bare `--model
    ///    ripvec` with no `--model-repo` flag works.
    ///
    /// # Errors
    ///
    /// Propagates the underlying I/O, download, or parse error if the
    /// files cannot be obtained or the safetensors layout is
    /// unrecognized.
    pub fn from_pretrained(model_repo: &str) -> crate::Result<Self> {
        let resolved = Self::resolve_model_dir(model_repo)?;
        let model = StaticEmbedModel::from_path(&resolved, Some(true))
            .map_err(|e| crate::Error::Other(anyhow::anyhow!("static model load failed: {e}")))?;
        let hidden_dim = model.hidden_dim();
        Ok(Self {
            model,
            model_repo: model_repo.to_string(),
            hidden_dim,
        })
    }

    /// Resolve `model_repo` to a directory containing the model files.
    ///
    /// If `model_repo` is an existing local directory, returns it as-is.
    /// Otherwise downloads via `hf-hub` and returns the cache directory.
    fn resolve_model_dir(model_repo: &str) -> crate::Result<PathBuf> {
        let local = Path::new(model_repo);
        if local.is_dir() {
            return Ok(local.to_path_buf());
        }

        // HuggingFace repo path. Download the three required files and
        // return the directory `hf-hub` cached them into. All files
        // land in the same snapshot directory.
        let api = Api::new().map_err(|e| crate::Error::Download(e.to_string()))?;
        let repo = api.model(model_repo.to_string());
        let _ = repo
            .get("config.json")
            .map_err(|e| crate::Error::Download(e.to_string()))?;
        let _ = repo
            .get("tokenizer.json")
            .map_err(|e| crate::Error::Download(e.to_string()))?;
        let weights_path = repo
            .get("model.safetensors")
            .map_err(|e| crate::Error::Download(e.to_string()))?;
        // hf-hub returns the file path; the snapshot directory is its parent.
        weights_path
            .parent()
            .map(std::path::Path::to_path_buf)
            .ok_or_else(|| {
                crate::Error::Other(anyhow::anyhow!(
                    "hf-hub returned root path for {model_repo}; cannot resolve snapshot dir"
                ))
            })
    }

    /// Chunk + embed an explicit list of files, skipping the walk.
    ///
    /// Used by [`RipvecIndex::apply_diff`](crate::encoder::ripvec::index::RipvecIndex::apply_diff)
    /// to incrementally re-embed just the files that changed since the
    /// last reconcile. `root` is the corpus root the paths are
    /// relative to (used for the chunker's `rel_path` field, matching
    /// what [`VectorEncoder::embed_root`] writes for unchanged files).
    ///
    /// Returns `(chunks, embeddings)` in flat lists; ordering mirrors
    /// the per-file traversal order of `paths`. Files that fail to
    /// read or chunk are silently skipped (same policy as
    /// [`chunk_one_file`]).
    ///
    /// # Why a separate method
    ///
    /// [`VectorEncoder::embed_root`] is a heavy three-stage pipeline
    /// optimized for full-corpus builds (thousands of files). For the
    /// "1-50 files changed" case that drives reconciliation, the
    /// sequential single-batch path here is simpler and faster: no
    /// rayon pool spin-up, no bounded channels, no inter-stage
    /// hand-off cost. The batch encode is a single [`encode_batch`]
    /// call.
    ///
    /// # Errors
    ///
    /// Returns the underlying error if `encode_batch` fails.
    pub fn embed_paths(
        &self,
        root: &Path,
        paths: &[std::path::PathBuf],
        profiler: &Profiler,
    ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
        let _guard = profiler.phase("embed_paths");
        let mut chunks_out: Vec<CodeChunk> = Vec::new();
        let mut texts: Vec<String> = Vec::new();
        for path in paths {
            let (file_chunks, file_texts) = chunk_one_file(root, path);
            chunks_out.extend(file_chunks);
            texts.extend(file_texts);
        }
        if chunks_out.is_empty() {
            return Ok((Vec::new(), Vec::new()));
        }
        let text_refs: Vec<&str> = texts.iter().map(String::as_str).collect();
        let embeddings = self.model.encode_batch(&text_refs);
        debug_assert_eq!(embeddings.len(), chunks_out.len());
        Ok((chunks_out, embeddings))
    }
}

impl VectorEncoder for StaticEncoder {
    /// Walk `root`, then chunk and embed every collected file through a
    /// three-stage pipeline connected by bounded channels:
    ///
    /// 1. **Chunk producer** — a rayon `par_iter` over the file list,
    ///    run on a dedicated thread pool. Each file goes through
    ///    [`chunk_one_file`] and its `(CodeChunk, String)` pairs are
    ///    sent into a channel of capacity `PIPELINE_BATCH_SIZE * 8`.
    /// 2. **Batch accumulator** — one scoped thread drains that
    ///    channel, groups `PIPELINE_BATCH_SIZE` pairs per batch, and
    ///    forwards batches into a channel of capacity
    ///    `PIPELINE_RING_SIZE`.
    /// 3. **Encode worker** — one scoped thread receives batches,
    ///    calls `StaticEmbedModel::encode_batch`, and appends the
    ///    `(chunk, embedding)` pairs to the shared output.
    ///
    /// Why this shape:
    ///
    /// - The bounded queues cap the number of chunks in flight at
    ///   `PIPELINE_BATCH_SIZE * 8 + PIPELINE_RING_SIZE * PIPELINE_BATCH_SIZE`
    ///   regardless of corpus size, instead of holding every chunk's
    ///   text between a "chunk all" and an "embed all" phase.
    /// - Chunking overlaps with embedding rather than running
    ///   strictly before it.
    ///
    /// The returned vectors are parallel (`chunks[i]` belongs to
    /// `embeddings[i]`); ordering across files is not meaningful.
    fn embed_root(
        &self,
        root: &Path,
        cfg: &SearchConfig,
        profiler: &Profiler,
    ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
        // Phase 1: walk. The complete file list is needed up front so
        // the producer stage can `par_iter` over it.
        let walk_options = cfg.walk_options();
        let file_paths = {
            let _guard = profiler.phase("walk");
            collect_files_with_options(root, &walk_options)
        };
        if file_paths.is_empty() {
            return Ok((Vec::new(), Vec::new()));
        }

        // Bounded channels between the stages; see the module constants
        // for the sizing rationale.
        let (chunk_tx, chunk_rx) = bounded::<(CodeChunk, String)>(PIPELINE_BATCH_SIZE * 8);
        let (batch_tx, batch_rx) = bounded::<Vec<(CodeChunk, String)>>(PIPELINE_RING_SIZE);

        // Output of the encode stage. Each chunk is stored together
        // with its embedding, so the pairing survives whatever order
        // batches arrive in.
        let output: Mutex<Vec<(CodeChunk, Vec<f32>)>> = Mutex::new(Vec::new());
        let model = &self.model;

        // Stage 1 gets its OWN rayon pool. On the global pool, the
        // producer's par_iter workers would park on full
        // `chunk_tx.send()` calls, leaving no rayon workers for the
        // par_iter inside `encode_batch` — a nested-rayon deadlock
        // (seen in profiling as a process at 0% CPU with every thread
        // parked).
        //
        // Half the cores go to chunking; the global pool keeps the
        // rest for the encode worker.
        let num_cores = rayon::current_num_threads().max(2);
        let chunk_threads = (num_cores / 2).max(1);
        let chunk_pool = rayon::ThreadPoolBuilder::new()
            .num_threads(chunk_threads)
            .thread_name(|i| format!("semble-chunk-{i}"))
            .build()
            .map_err(|e| crate::Error::Other(anyhow::anyhow!("chunk thread pool build: {e}")))?;

        let _phase_guard = profiler.phase("pipeline");
        std::thread::scope(|scope| {
            // Stage 1: chunk producer on the dedicated pool. The sender
            // is moved into this thread, so the chunk channel closes
            // when the thread finishes.
            scope.spawn(move || {
                chunk_pool.install(|| {
                    file_paths.par_iter().for_each(|file| {
                        let (chunks, contents) = chunk_one_file(root, file);
                        // Stop feeding this file's chunks at the first
                        // failed send (the receiver is gone).
                        let _ = chunks
                            .into_iter()
                            .zip(contents)
                            .try_for_each(|pair| chunk_tx.send(pair));
                    });
                });
            });

            // Stage 2: batch accumulator. Owns the batch sender, so the
            // batch channel closes when this thread finishes.
            scope.spawn(move || {
                let mut pending: Vec<(CodeChunk, String)> =
                    Vec::with_capacity(PIPELINE_BATCH_SIZE);
                for pair in chunk_rx {
                    pending.push(pair);
                    if pending.len() < PIPELINE_BATCH_SIZE {
                        continue;
                    }
                    let full_batch =
                        std::mem::replace(&mut pending, Vec::with_capacity(PIPELINE_BATCH_SIZE));
                    if batch_tx.send(full_batch).is_err() {
                        return;
                    }
                }
                // Flush the final, partially filled batch.
                if !pending.is_empty() {
                    let _ = batch_tx.send(pending);
                }
            });

            // Stage 3: encode worker.
            scope.spawn(|| {
                for batch in batch_rx {
                    if batch.is_empty() {
                        continue;
                    }
                    let (chunks, texts): (Vec<CodeChunk>, Vec<String>) =
                        batch.into_iter().unzip();
                    let text_refs: Vec<&str> = texts.iter().map(String::as_str).collect();
                    let embeddings = model.encode_batch(&text_refs);
                    debug_assert_eq!(embeddings.len(), chunks.len());
                    output
                        .lock()
                        .expect("output mutex poisoned")
                        .extend(chunks.into_iter().zip(embeddings));
                }
            });
        });

        // All stages have joined; split the pairs into the two parallel
        // vectors the trait returns.
        let collected = output.into_inner().expect("output mutex poisoned");
        let (chunks_out, embs_out): (Vec<CodeChunk>, Vec<Vec<f32>>) =
            collected.into_iter().unzip();
        Ok((chunks_out, embs_out))
    }

    /// Embedding width recorded when the model was loaded.
    fn hidden_dim(&self) -> usize {
        self.hidden_dim
    }

    /// The repo ID or local path this encoder was created from.
    fn identity(&self) -> &str {
        &self.model_repo
    }
}

/// A resolved symbol capture: name text, its byte span, and the LSP SymbolKind
/// of its enclosing definition node.
///
/// Produced by [`extract_name_captures`] from a single query match that has
/// a `@name` capture. When the match also has a `@def` capture, `lsp_kind`
/// is derived from that node; otherwise it falls back to `VARIABLE`.
struct NameCapture {
    /// Byte offset of the `@name` node's start within the source.
    start_byte: usize,
    /// Byte offset one past the `@name` node's end.
    end_byte: usize,
    /// Identifier text extracted from the `@name` capture.
    name: String,
    /// LSP SymbolKind derived from the `@def` node's tree-sitter node kind
    /// (`VARIABLE` when the match had no `@def` capture).
    lsp_kind: u32,
}

/// Extract `@name` + `@def` capture pairs from a tree-sitter parse of `source`
/// using the language config's compiled query.
///
/// Returns a list of [`NameCapture`] for every match that has both a `@name`
/// and a `@def` capture.  The list is sorted by `start_byte` so callers can do
/// a linear scan per chunk boundary.
///
/// Performs exactly one parse and one query execution per `chunk_one_file`
/// call — O(1) parses regardless of the number of chunks.
/// Collect one [`NameCapture`] per query match that has a usable `@name`
/// capture.
///
/// `source` is parsed once with the language from `lang_cfg`, and the
/// config's compiled query is executed once over the resulting tree. For
/// each match:
///
/// - the `@name` capture supplies the identifier text and byte span (when a
///   match carries several, the last usable one wins);
/// - the `@def` capture, if present, supplies the node kind that is mapped
///   to an LSP SymbolKind; without it the kind falls back to `VARIABLE`.
///
/// A `@name` capture whose byte range cannot be sliced out of `source`
/// (out of bounds or not on a UTF-8 character boundary) is ignored instead
/// of panicking.
///
/// Returns an empty list when the language cannot be set on the parser or
/// the parse fails. The result is sorted by `start_byte` so callers can
/// scan it in order per chunk boundary.
fn extract_name_captures(
    source: &str,
    lang_cfg: &crate::languages::LangConfig,
) -> Vec<NameCapture> {
    let mut parser = Parser::new();
    if parser.set_language(&lang_cfg.language).is_err() {
        return Vec::new();
    }
    let Some(tree) = parser.parse(source, None) else {
        return Vec::new();
    };
    let mut cursor = QueryCursor::new();
    let mut matches = cursor.matches(&lang_cfg.query, tree.root_node(), source.as_bytes());
    let capture_names = lang_cfg.query.capture_names();
    let mut result: Vec<NameCapture> = Vec::new();
    // `matches` is a streaming iterator, so it has to be driven with
    // `while let` rather than a `for` loop.
    while let Some(m) = matches.next() {
        // `(start_byte, end_byte, text)` of the `@name` capture, if any.
        let mut name: Option<(usize, usize, &str)> = None;
        // Tree-sitter node kind of the `@def` capture, if any.
        let mut def_kind: Option<&str> = None;

        for cap in m.captures {
            let cap_name = &capture_names[cap.index as usize];
            if *cap_name == "name" {
                let start = cap.node.start_byte();
                let end = cap.node.end_byte();
                // Checked slicing: rejects out-of-range offsets and
                // non-character-boundary offsets alike.
                if let Some(text) = source.get(start..end) {
                    name = Some((start, end, text));
                }
            } else if *cap_name == "def" {
                def_kind = Some(cap.node.kind());
            }
        }

        let Some((start_byte, end_byte, text)) = name else {
            continue;
        };
        // If there's no @def capture, fall back to Variable (pre-B1 default).
        let lsp_kind = match def_kind {
            Some(kind) => lsp_symbol_kind_for_node_kind(kind),
            None => crate::languages::lsp_symbol_kind::VARIABLE,
        };
        result.push(NameCapture {
            start_byte,
            end_byte,
            name: text.to_string(),
            lsp_kind,
        });
    }
    // Sort by byte position so callers can scan in order per boundary.
    result.sort_unstable_by_key(|c| c.start_byte);
    result
}

/// Find the best name and LSP SymbolKind for a chunk covering
/// `[chunk_start, chunk_end)` bytes.
///
/// "Best" = the first [`NameCapture`] whose `start_byte` falls inside the
/// chunk's byte range. Returns `("", VARIABLE)` if none found (graceful
/// fallback preserving pre-B1 default kind).
/// Find the name and LSP SymbolKind to attach to a chunk covering
/// `[chunk_start, chunk_end)` bytes.
///
/// The winner is the earliest capture (by `start_byte`) whose identifier
/// span lies entirely inside the chunk: it starts at or after
/// `chunk_start` and ends at or before `chunk_end`. A name that straddles
/// the end of the chunk is skipped.
///
/// `captures` must be sorted by `start_byte`, which is how
/// `extract_name_captures` returns them. That ordering lets the search
/// jump to the first candidate with a binary search instead of rescanning
/// the list from the beginning for every chunk.
///
/// Returns `("", VARIABLE)` if no capture qualifies (graceful fallback
/// preserving the pre-B1 default kind).
fn name_for_chunk(captures: &[NameCapture], chunk_start: usize, chunk_end: usize) -> (&str, u32) {
    // Index of the first capture that starts at or after the chunk start.
    let first_candidate = captures.partition_point(|cap| cap.start_byte < chunk_start);
    for cap in &captures[first_candidate..] {
        // Every capture from here on starts inside or after the chunk, so
        // only its end still needs checking.
        if cap.end_byte <= chunk_end {
            return (cap.name.as_str(), cap.lsp_kind);
        }
        // Captures are sorted by start byte: once one starts at or past
        // `chunk_end`, no later capture can qualify.
        if cap.start_byte >= chunk_end {
            break;
        }
    }
    ("", crate::languages::lsp_symbol_kind::VARIABLE)
}

/// Chunk one file. Returns `(file_chunks, file_contents)` — empty
/// when the file is too large, can't be read, or has no chunks.
/// Read, chunk, and label a single file.
///
/// Returns `(chunks, contents)` as two parallel vectors: `contents[i]` is
/// the text of `chunks[i]`. Both are empty when the file's metadata cannot
/// be read, the file is larger than `MAX_FILE_BYTES`, it cannot be read as
/// a UTF-8 string, or chunking yields nothing but whitespace.
///
/// `root` is only used to compute each chunk's relative `file_path`; when
/// `full` is not under `root`, the full path is used as-is.
fn chunk_one_file(root: &Path, full: &Path) -> (Vec<CodeChunk>, Vec<String>) {
    // Size gate before reading: oversized or un-stat-able files are skipped.
    let Ok(meta) = std::fs::metadata(full) else {
        return (Vec::new(), Vec::new());
    };
    if meta.len() > MAX_FILE_BYTES {
        return (Vec::new(), Vec::new());
    }
    let Ok(source) = std::fs::read_to_string(full) else {
        return (Vec::new(), Vec::new());
    };

    let ext = full
        .extension()
        .and_then(|e| e.to_str())
        .unwrap_or_default();
    let lang_cfg = config_for_extension(ext);
    let language = lang_cfg.as_ref().map(|c| &c.language);

    // One parse per file gathers every `@name` / `@def` capture used to
    // label chunks. Without a language config (or when the parse fails)
    // the list is empty, so chunk names stay "" and kinds stay empty.
    let name_captures: Vec<NameCapture> = match lang_cfg.as_deref() {
        Some(cfg) => extract_name_captures(&source, cfg),
        None => Vec::new(),
    };

    let relative = full.strip_prefix(root).unwrap_or(full);
    let rel_path = relative.display().to_string();

    let content_kind = ContentKind::from_extension(ext);
    let boundaries = chunk_source(&source, language, DEFAULT_DESIRED_CHUNK_CHARS);
    let mut chunks = Vec::with_capacity(boundaries.len());
    let mut contents = Vec::with_capacity(boundaries.len());
    for boundary in boundaries {
        let text = boundary.content(&source).to_string();
        // Whitespace-only chunks carry nothing worth embedding.
        if text.trim().is_empty() {
            continue;
        }

        let (symbol, lsp_kind) =
            name_for_chunk(&name_captures, boundary.start_byte, boundary.end_byte);
        // The LSP SymbolKind is stored as a decimal string so downstream
        // consumers (e.g., ripvec-mcp's lsp_workspace_symbols) can parse it
        // directly. Chunks without a recognised definition keep an empty
        // kind (consistent with pre-B2 behaviour).
        let kind = if symbol.is_empty() {
            String::new()
        } else {
            lsp_kind.to_string()
        };

        contents.push(text.clone());
        chunks.push(CodeChunk {
            file_path: rel_path.clone(),
            name: symbol.to_string(),
            kind,
            content_kind,
            start_line: boundary.start_line,
            end_line: boundary.end_line,
            // The dense/AST-merge path does not track the identifier line
            // separately; fall back to start_line per the
            // CodeChunk.symbol_line documentation.
            symbol_line: boundary.start_line,
            content: text.clone(),
            enriched_content: text,
            qualified_name: None,
        });
    }
    (chunks, contents)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::encoder::VectorEncoder;

    /// Helper: write `source` to `filename` inside a fresh temp directory
    /// and return `(dir, path)`. The caller must keep `dir` alive for as
    /// long as the file is needed — dropping it removes the directory.
    fn write_temp(source: &str, filename: &str) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = dir.path().join(filename);
        std::fs::write(&path, source).expect("write");
        (dir, path)
    }

    /// `test:chunk_one_file_populates_name_from_tree_sitter` — chunk_one_file
    /// must populate `name` from tree-sitter when the source contains a
    /// recognisable definition.
    #[test]
    fn chunk_one_file_populates_name_from_tree_sitter() {
        let source = "pub fn add(a: i32, b: i32) -> i32 { a + b }\n";
        let (dir, path) = write_temp(source, "add.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);
        assert!(
            !chunks.is_empty(),
            "expected at least one chunk from Rust source"
        );
        assert!(
            chunks.iter().any(|c| c.name == "add"),
            "expected at least one chunk with name 'add'; got names: {:?}",
            chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
        );
    }

    /// `test:chunk_one_file_leaves_name_empty_when_no_identifier` — when the
    /// source has no tree-sitter-recognisable definitions, name stays empty.
    #[test]
    fn chunk_one_file_leaves_name_empty_when_no_identifier() {
        // Only whitespace and comments — no function/struct/enum definitions.
        let source = "// just a comment\n   \n// another comment\n";
        let (dir, path) = write_temp(source, "comments.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);
        // Either no chunks at all, or all chunks have an empty name.
        for c in &chunks {
            assert!(
                c.name.is_empty(),
                "expected empty name for comment-only source; got {:?}",
                c.name
            );
        }
    }

    /// `StaticEncoder` implements `VectorEncoder` + Send + Sync.
    /// Compile-time check (`test:static-encoder-implements-vector-encoder`).
    #[test]
    fn static_encoder_implements_vector_encoder() {
        fn assert_trait_object<T: VectorEncoder + Send + Sync>() {}
        assert_trait_object::<StaticEncoder>();
    }

    // -------------------------------------------------------------------------
    // B2: chunk_one_file kind-tagging tests
    // -------------------------------------------------------------------------

    /// `test:chunk_one_file_populates_kind_for_rust_struct` — `chunk_one_file`
    /// emits a chunk whose `kind` is `"23"` (LSP Struct) for a `pub struct`.
    ///
    /// Behavior: trigger-fails-on-baseline-then-passes-post-fix.
    /// On the baseline, `kind` was always `""` (empty string from the semble
    /// chunker), so this test fails. Post-B2 the kind is the LSP numeric string.
    #[test]
    fn chunk_one_file_populates_kind_for_rust_struct() {
        let source = "pub struct Foo { x: i32 }\n";
        let (dir, path) = write_temp(source, "foo.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);
        let struct_chunk = chunks.iter().find(|c| c.name == "Foo");
        assert!(
            struct_chunk.is_some(),
            "expected a chunk named 'Foo'; got: {:?}",
            chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
        );
        let kind = &struct_chunk.unwrap().kind;
        assert_eq!(
            kind.as_str(),
            "23",
            "struct_item must emit LSP SymbolKind::Struct (23); got: {kind:?}"
        );
    }

    /// `test:chunk_one_file_populates_kind_for_rust_trait` — `chunk_one_file`
    /// emits a chunk whose `kind` is `"11"` (LSP Interface) for a trait.
    #[test]
    fn chunk_one_file_populates_kind_for_rust_trait() {
        let source = "pub trait MyTrait { fn method(&self); }\n";
        let (dir, path) = write_temp(source, "trait.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);
        let trait_chunk = chunks.iter().find(|c| c.name == "MyTrait");
        assert!(
            trait_chunk.is_some(),
            "expected a chunk named 'MyTrait'; got: {:?}",
            chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
        );
        let kind = &trait_chunk.unwrap().kind;
        assert_eq!(
            kind.as_str(),
            "11",
            "trait_item must emit LSP SymbolKind::Interface (11); got: {kind:?}"
        );
    }

    /// `test:chunk_one_file_kind_distinct_from_variable_default` — after B2,
    /// named chunks must not carry the old hardcoded `""` (empty) kind.
    ///
    /// Pre-B2 all chunks from the semble AST-merge path had `kind: String::new()`
    /// (= `""`). This test ensures that chunks whose name is non-empty carry a
    /// meaningful, non-empty LSP kind string.
    ///
    /// Note: The semble AST-merge chunker packs adjacent small definitions into a
    /// single chunk and assigns only the FIRST capture's name. The kind test
    /// therefore validates the overall invariant — named chunks have non-empty
    /// kinds — rather than testing each definition independently (which requires
    /// definitions large enough to occupy distinct chunks).
    #[test]
    fn chunk_one_file_kind_distinct_from_variable_default() {
        // Use a file with a single, definitively-named struct so the chunk
        // carries a meaningful kind. The semble chunker is expected to emit
        // one chunk with name "Qux" and kind "23" (Struct).
        let source = "pub struct Qux { x: i32, y: i32 }\n";
        let (dir, path) = write_temp(source, "qux.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);

        // Find the named chunks.
        let named_chunks: Vec<_> = chunks.iter().filter(|c| !c.name.is_empty()).collect();
        assert!(
            !named_chunks.is_empty(),
            "expected at least one named chunk from Rust source with struct definition"
        );

        // Every named chunk must have a non-empty kind (pre-B2 regression: kind was "").
        for c in &named_chunks {
            assert!(
                !c.kind.is_empty(),
                "named chunk '{}' must have non-empty kind (pre-B2 regression); got empty",
                c.name
            );
        }

        // The struct chunk must exist AND have kind "23" (LSP Struct). A
        // missing chunk is a failure, not a silently skipped check.
        let qux = named_chunks.iter().find(|c| c.name == "Qux");
        assert!(
            qux.is_some(),
            "expected a chunk named 'Qux'; got: {:?}",
            named_chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
        );
        let kind = &qux.unwrap().kind;
        assert_eq!(
            kind.as_str(),
            "23",
            "Qux (struct_item) must emit LSP SymbolKind::Struct (23); got: {kind:?}"
        );
    }

    /// `from_pretrained` loads a model from a local directory and reports
    /// [`DEFAULT_HIDDEN_DIM`] as its hidden dimension.
    ///
    /// Ignored by default because it needs model files on disk: point
    /// `RIPVEC_SEMBLE_MODEL_PATH` at a directory holding them. When the
    /// variable is unset the test returns early without asserting anything.
    ///
    /// Corresponds to acceptance `test:static-encoder-hidden-dim-256` and
    /// `test:static-encoder-loads-potion-code-16m` and
    /// `test:static-encoder-output-is-l2-normalized`.
    #[test]
    #[ignore = "requires local model files at RIPVEC_SEMBLE_MODEL_PATH"]
    fn static_encoder_loads_potion_code_16m() {
        let Ok(path) = std::env::var("RIPVEC_SEMBLE_MODEL_PATH") else {
            eprintln!("RIPVEC_SEMBLE_MODEL_PATH not set; skipping");
            return;
        };
        let enc = StaticEncoder::from_pretrained(&path).expect("model load should succeed");
        assert_eq!(enc.hidden_dim(), DEFAULT_HIDDEN_DIM);
        // identity() reflects what the caller passed (typically the
        // local path under test).
        assert_eq!(enc.identity(), path);

        // Verify L2-normalized output via the public encode_query path.
        let row = enc.encode_query("hello world");
        let norm: f32 = row.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!(
            (norm - 1.0).abs() < 1e-3,
            "expected L2-normalized output; got norm={norm}"
        );
    }
}