ripvec_core/encoder/ripvec/dense.rs
1//! Static encoder: in-process `StaticEmbedModel` reimplementation.
2//!
3//! Port of `~/src/semble/src/semble/index/dense.py`. Wraps
4//! [`StaticEmbedModel`] loaded with `minishlab/potion-base-32M`
5//! (256-dim, L2-normalized). Implements [`VectorEncoder`] for the
6//! `--model ripvec` path. CPU-only; no batching ring buffer.
7//!
8//! Default was bumped to `potion-base-32M` in v1.3.0 after the
9//! gutenberg + python-repos matrix showed 32M winning prose by
10//! 0.058 NDCG@10 while losing code by only 0.004 — a clear
11//! single-default win once the i64 mapping bug and the reranker
12//! pooler / sigmoid / truncation bugs were fixed. The code-tuned
13//! `potion-code-16M` is still available via `--model-repo`.
14//!
15//! ## Why not `model2vec-rs`?
16//!
17//! The previous wave used the upstream `model2vec-rs` crate. Two real
18//! problems pushed us to reimplement (see
//! `crates/ripvec-core/src/encoder/ripvec/static_model.rs` for the
20//! full design rationale):
21//!
22//! 1. `model2vec_rs::StaticModel::encode_with_args` runs `pool_ids`
23//! in a serial inner loop while `tokenizers::encode_batch_fast`
24//! spawns its own rayon pool. Wrapping that path in our outer
25//! `par_chunks` produced 60% `__psynch_cvwait` in the linux-corpus
26//! profile — nested rayon scopes parking on each other. The
27//! reimplementation does ONE big tokenize plus a `par_iter` over
28//! `pool_ids` — no nested rayon, no parking.
29//! 2. `model2vec-rs 0.2` pinned `ndarray 0.15`; ripvec-core uses
30//! `ndarray 0.17`. The two `Array2<f32>` types were not
31//! interchangeable, forcing a `Vec<Vec<f32>>` shim. Owning the
32//! load path eliminates the mismatch.
33
34use std::path::{Path, PathBuf};
35use std::sync::Mutex;
36
37use crossbeam_channel::bounded;
38use hf_hub::api::sync::Api;
39use rayon::prelude::*;
40
41use streaming_iterator::StreamingIterator;
42use tree_sitter::{Parser, QueryCursor};
43
44use crate::chunk::{CodeChunk, ContentKind};
45use crate::embed::SearchConfig;
46use crate::encoder::VectorEncoder;
47use crate::encoder::ripvec::chunking::{DEFAULT_DESIRED_CHUNK_CHARS, chunk_source};
48use crate::encoder::ripvec::static_model::StaticEmbedModel;
49use crate::languages::config_for_extension;
50use crate::profile::Profiler;
51use crate::walk::collect_files_with_options;
52
/// Number of `(chunk, text)` pairs packed into each batch handed to the
/// encode stage of the streaming pipeline.
///
/// Intended to match `StaticEmbedModel`'s internal `BATCH_SIZE` so every
/// emitted batch is exactly one `encode_batch_fast` call's worth of work.
/// NOTE(review): that constant lives outside this file — keep the two in sync.
const PIPELINE_BATCH_SIZE: usize = 1024;

/// Capacity, in full batches, of the channel between the batch accumulator
/// and the encode worker.
///
/// Deep enough for the encoder to stay busy while the chunker fills the next
/// batch; small enough that peak memory stays bounded.
const PIPELINE_RING_SIZE: usize = 4;

/// Default model repo identifier for the ripvec path.
///
/// This is a HuggingFace repo ID. [`StaticEncoder::from_pretrained`] accepts
/// either an ID like this one (files are fetched through `hf-hub`) or an
/// existing local directory, and whichever string was passed becomes the
/// encoder's `identity()`.
pub const DEFAULT_MODEL_REPO: &str = "minishlab/potion-base-32M";

/// Default hidden dimension for [`DEFAULT_MODEL_REPO`].
///
/// `StaticEncoder` does not read this constant: it takes the width from the
/// loaded model. NOTE(review): confirm the value against the model card for
/// [`DEFAULT_MODEL_REPO`] — the default repo was changed after this constant
/// was introduced.
pub const DEFAULT_HIDDEN_DIM: usize = 256;

/// Maximum source file size to read, in bytes (mirrors semble's
/// `_MAX_FILE_BYTES = 1_000_000` from `index/create.py:16`).
const MAX_FILE_BYTES: u64 = 1_000_000;
75
76/// CPU-only static encoder.
77///
78/// Owns a loaded [`StaticEmbedModel`] plus identity metadata. The
79/// embedder is constructed by `main.rs::load_pipeline` via
80/// [`StaticEncoder::from_pretrained`], passing either a local path
81/// containing the Model2Vec files or (planned) an HF repo ID.
82pub struct StaticEncoder {
83 model: StaticEmbedModel,
84 model_repo: String,
85 hidden_dim: usize,
86}
87
88impl StaticEncoder {
89 /// Encode a query string into a single embedding row.
90 ///
91 /// Used by `RipvecIndex::search` for hybrid/semantic dispatch.
92 #[must_use]
93 pub fn encode_query(&self, query: &str) -> Vec<f32> {
94 self.model.encode_query(query)
95 }
96
97 /// Load a model by HuggingFace repo ID or local path.
98 ///
99 /// Two acceptance shapes:
100 ///
101 /// 1. **Local path** — if `model_repo` names an existing directory,
102 /// load directly from it. Used by the parity test fixture path
103 /// (`/tmp/potion-base-32M`) and any user pre-staging files.
104 /// 2. **HuggingFace repo ID** — otherwise treat as `org/repo`,
105 /// download `config.json` / `tokenizer.json` / `model.safetensors`
106 /// via `hf-hub` into `~/.cache/huggingface/hub/`, and load from
107 /// there. Matches `load_classic_cpu` / `load_modernbert_cpu`'s
108 /// behaviour so the user-facing API is consistent: bare `--model
109 /// ripvec` with no `--model-repo` flag works.
110 ///
111 /// # Errors
112 ///
113 /// Propagates the underlying I/O, download, or parse error if the
114 /// files cannot be obtained or the safetensors layout is
115 /// unrecognized.
116 pub fn from_pretrained(model_repo: &str) -> crate::Result<Self> {
117 let resolved = Self::resolve_model_dir(model_repo)?;
118 let model = StaticEmbedModel::from_path(&resolved, Some(true))
119 .map_err(|e| crate::Error::Other(anyhow::anyhow!("static model load failed: {e}")))?;
120 let hidden_dim = model.hidden_dim();
121 Ok(Self {
122 model,
123 model_repo: model_repo.to_string(),
124 hidden_dim,
125 })
126 }
127
128 /// Resolve `model_repo` to a directory containing the model files.
129 ///
130 /// If `model_repo` is an existing local directory, returns it as-is.
131 /// Otherwise downloads via `hf-hub` and returns the cache directory.
132 fn resolve_model_dir(model_repo: &str) -> crate::Result<PathBuf> {
133 let local = Path::new(model_repo);
134 if local.is_dir() {
135 return Ok(local.to_path_buf());
136 }
137
138 // HuggingFace repo path. Download the three required files and
139 // return the directory `hf-hub` cached them into. All files
140 // land in the same snapshot directory.
141 let api = Api::new().map_err(|e| crate::Error::Download(e.to_string()))?;
142 let repo = api.model(model_repo.to_string());
143 let _ = repo
144 .get("config.json")
145 .map_err(|e| crate::Error::Download(e.to_string()))?;
146 let _ = repo
147 .get("tokenizer.json")
148 .map_err(|e| crate::Error::Download(e.to_string()))?;
149 let weights_path = repo
150 .get("model.safetensors")
151 .map_err(|e| crate::Error::Download(e.to_string()))?;
152 // hf-hub returns the file path; the snapshot directory is its parent.
153 weights_path
154 .parent()
155 .map(std::path::Path::to_path_buf)
156 .ok_or_else(|| {
157 crate::Error::Other(anyhow::anyhow!(
158 "hf-hub returned root path for {model_repo}; cannot resolve snapshot dir"
159 ))
160 })
161 }
162
163 /// Chunk + embed an explicit list of files, skipping the walk.
164 ///
165 /// Used by [`RipvecIndex::apply_diff`](crate::encoder::ripvec::index::RipvecIndex::apply_diff)
166 /// to incrementally re-embed just the files that changed since the
167 /// last reconcile. `root` is the corpus root the paths are
168 /// relative to (used for the chunker's `rel_path` field, matching
169 /// what [`VectorEncoder::embed_root`] writes for unchanged files).
170 ///
171 /// Returns `(chunks, embeddings)` in flat lists; ordering mirrors
172 /// the per-file traversal order of `paths`. Files that fail to
173 /// read or chunk are silently skipped (same policy as
174 /// [`chunk_one_file`]).
175 ///
176 /// # Why a separate method
177 ///
178 /// [`VectorEncoder::embed_root`] is a heavy three-stage pipeline
179 /// optimized for full-corpus builds (thousands of files). For the
180 /// "1-50 files changed" case that drives reconciliation, the
181 /// sequential single-batch path here is simpler and faster: no
182 /// rayon pool spin-up, no bounded channels, no inter-stage
183 /// hand-off cost. The batch encode is a single [`encode_batch`]
184 /// call.
185 ///
186 /// # Errors
187 ///
188 /// Returns the underlying error if `encode_batch` fails.
189 pub fn embed_paths(
190 &self,
191 root: &Path,
192 paths: &[std::path::PathBuf],
193 profiler: &Profiler,
194 ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
195 let _guard = profiler.phase("embed_paths");
196 let mut chunks_out: Vec<CodeChunk> = Vec::new();
197 let mut texts: Vec<String> = Vec::new();
198 for path in paths {
199 let (file_chunks, file_texts) = chunk_one_file(root, path);
200 chunks_out.extend(file_chunks);
201 texts.extend(file_texts);
202 }
203 if chunks_out.is_empty() {
204 return Ok((Vec::new(), Vec::new()));
205 }
206 let text_refs: Vec<&str> = texts.iter().map(String::as_str).collect();
207 let embeddings = self.model.encode_batch(&text_refs);
208 debug_assert_eq!(embeddings.len(), chunks_out.len());
209 Ok((chunks_out, embeddings))
210 }
211}
212
213impl VectorEncoder for StaticEncoder {
214 /// Three-stage bounded-queue pipeline:
215 ///
216 /// 1. **Chunk producer** — rayon `par_iter` over the file list. Each
217 /// file is read, parsed by tree-sitter (or line-merged on
218 /// fallback), and emitted as `(CodeChunk, String)` pairs into a
219 /// bounded channel of capacity `PIPELINE_BATCH_SIZE * 8`.
220 /// 2. **Batch accumulator** — a single scoped thread drains the
221 /// chunk channel, packs `PIPELINE_BATCH_SIZE` pairs per batch,
222 /// and forwards into a bounded channel of capacity
223 /// `PIPELINE_RING_SIZE`.
224 /// 3. **Encode worker** — a single scoped thread receives batches
225 /// and calls `StaticEmbedModel::encode_batch`, whose internal
226 /// `par_iter` lights up rayon for the pool_ids kernel.
227 ///
228 /// Why this shape:
229 ///
230 /// - The previous "chunk all, then embed all" implementation held
231 /// the entire `Vec<String>` of chunk contents in memory between
232 /// phases. On the linux corpus that was ~400 MB peak. The
233 /// bounded queues cap in-flight memory at
234 /// `PIPELINE_BATCH_SIZE * 8 + PIPELINE_RING_SIZE * PIPELINE_BATCH_SIZE`
235 /// chunks regardless of corpus size — under 15 MB.
236 /// - The chunk phase (13s on linux) is hidden inside the embed
237 /// phase (70s) instead of serializing before it. Pre-pipeline
238 /// profile showed user-time at 394s on 82s wall = 4.8x
239 /// parallelism on 12 cores; pipeline lets idle cores chew on
240 /// chunking while embed runs.
241 /// - Mirrors `embed::embed_all_streaming`'s shape so the two
242 /// pipelines (BERT + semble) share architectural conventions.
243 fn embed_root(
244 &self,
245 root: &Path,
246 cfg: &SearchConfig,
247 profiler: &Profiler,
248 ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
249 // Phase 1: walk (still serial-to-pipeline because we need the
250 // full file list to par_iter over; the walk itself is rayon).
251 let walk_options = cfg.walk_options();
252 let file_paths = {
253 let _guard = profiler.phase("walk");
254 collect_files_with_options(root, &walk_options)
255 };
256 if file_paths.is_empty() {
257 return Ok((Vec::new(), Vec::new()));
258 }
259
260 // Bounded channels. See module constants for the rationale on
261 // PIPELINE_BATCH_SIZE and PIPELINE_RING_SIZE.
262 let (chunk_tx, chunk_rx) = bounded::<(CodeChunk, String)>(PIPELINE_BATCH_SIZE * 8);
263 let (batch_tx, batch_rx) = bounded::<Vec<(CodeChunk, String)>>(PIPELINE_RING_SIZE);
264
265 // The encoder stage writes ordered output behind a Mutex. Order
266 // across files isn't meaningful (RipvecIndex doesn't rely on
267 // chunk order), only the chunk[i] <-> embedding[i] pairing
268 // matters — which we preserve trivially by pushing in lockstep.
269 let output: Mutex<Vec<(CodeChunk, Vec<f32>)>> = Mutex::new(Vec::new());
270 let model = &self.model;
271
272 // Stage 1 runs on a DEDICATED rayon thread pool. If we used
273 // the global pool, Stage 1's par_iter workers would park on
274 // full `chunk_tx.send()` calls, and Stage 3's
275 // `encode_batch` → `pool_ids` par_iter would have no rayon
276 // workers available (they're all parked). That's a classic
277 // nested-rayon deadlock — observed in profiling as PID stuck
278 // at 0% CPU with 16 parked threads.
279 //
280 // Half the cores for chunking, half remain in the global pool
281 // for the encode worker's pool_ids. The chunk phase (tree-
282 // sitter + I/O bound) doesn't need full parallelism to
283 // pipeline cleanly behind embed.
284 let num_cores = rayon::current_num_threads().max(2);
285 let chunk_threads = (num_cores / 2).max(1);
286 let chunk_pool = rayon::ThreadPoolBuilder::new()
287 .num_threads(chunk_threads)
288 .thread_name(|i| format!("semble-chunk-{i}"))
289 .build()
290 .map_err(|e| crate::Error::Other(anyhow::anyhow!("chunk thread pool build: {e}")))?;
291
292 let _phase_guard = profiler.phase("pipeline");
293 std::thread::scope(|scope| {
294 // Stage 1: chunk producer on the dedicated pool.
295 let chunk_tx_owned = chunk_tx;
296 scope.spawn(move || {
297 chunk_pool.install(|| {
298 file_paths.par_iter().for_each(|full| {
299 let (chunks, contents) = chunk_one_file(root, full);
300 for (chunk, content) in chunks.into_iter().zip(contents) {
301 if chunk_tx_owned.send((chunk, content)).is_err() {
302 return;
303 }
304 }
305 });
306 });
307 // chunk_tx_owned drops here, closing the channel.
308 });
309
310 // Stage 2: batch accumulator.
311 let batch_tx_owned = batch_tx;
312 scope.spawn(move || {
313 let mut buf: Vec<(CodeChunk, String)> = Vec::with_capacity(PIPELINE_BATCH_SIZE);
314 for pair in chunk_rx {
315 buf.push(pair);
316 if buf.len() >= PIPELINE_BATCH_SIZE {
317 let batch =
318 std::mem::replace(&mut buf, Vec::with_capacity(PIPELINE_BATCH_SIZE));
319 if batch_tx_owned.send(batch).is_err() {
320 return;
321 }
322 }
323 }
324 if !buf.is_empty() {
325 let _ = batch_tx_owned.send(buf);
326 }
327 // batch_tx_owned drops here, closing the channel.
328 });
329
330 // Stage 3: encode worker.
331 scope.spawn(|| {
332 for batch in batch_rx {
333 if batch.is_empty() {
334 continue;
335 }
336 let mut chunks = Vec::with_capacity(batch.len());
337 let mut texts: Vec<String> = Vec::with_capacity(batch.len());
338 for (chunk, text) in batch {
339 chunks.push(chunk);
340 texts.push(text);
341 }
342 let text_refs: Vec<&str> = texts.iter().map(String::as_str).collect();
343 let embeddings = model.encode_batch(&text_refs);
344 debug_assert_eq!(embeddings.len(), chunks.len());
345 let mut out = output.lock().expect("output mutex poisoned");
346 for (chunk, emb) in chunks.into_iter().zip(embeddings) {
347 out.push((chunk, emb));
348 }
349 }
350 });
351 });
352
353 let collected = output.into_inner().expect("output mutex poisoned");
354 let mut chunks_out = Vec::with_capacity(collected.len());
355 let mut embs_out = Vec::with_capacity(collected.len());
356 for (chunk, emb) in collected {
357 chunks_out.push(chunk);
358 embs_out.push(emb);
359 }
360 Ok((chunks_out, embs_out))
361 }
362
363 fn hidden_dim(&self) -> usize {
364 self.hidden_dim
365 }
366
367 fn identity(&self) -> &str {
368 &self.model_repo
369 }
370}
371
/// One resolved symbol capture: the identifier text, where it sits in the
/// source, and what is known about its enclosing definition.
///
/// Built by [`extract_name_captures`] from a single query match that carries
/// a `@name` capture. The `@def` capture is optional: when it is missing,
/// `lsp_kind` is the Variable kind, `def_kind` is empty, and the def span
/// mirrors the name span.
struct NameCapture {
    /// Byte offset where the `@name` node starts within the source.
    start_byte: usize,
    /// Byte offset one past the end of the `@name` node.
    end_byte: usize,
    /// Identifier text sliced from the `@name` capture.
    name: String,
    /// LSP SymbolKind derived from the `@def` node.
    lsp_kind: u32,
    /// Tree-sitter node kind string of the `@def` node (e.g. "block",
    /// "attribute", "function_definition"). C13W1: the dense pipeline uses
    /// it to recognise HCL `attribute` captures and emit them as per-symbol
    /// micro-chunks; otherwise the AST-merge chunker folds them into the
    /// enclosing block's chunk and `find_similar(symbol_name=…)` cannot
    /// resolve the per-local name.
    def_kind: String,
    /// Start of the `@def` node's byte span `[start, end)`. Same C13W1 use
    /// as [`Self::def_kind`].
    def_start_byte: usize,
    /// Exclusive end of the `@def` node's byte span.
    def_end_byte: usize,
}
398
399/// Extract `@name` + `@def` capture pairs from a tree-sitter parse of `source`
400/// using the language config's compiled query.
401///
402/// Returns a list of [`NameCapture`] for every match that has both a `@name`
403/// and a `@def` capture. The list is sorted by `start_byte` so callers can do
404/// a linear scan per chunk boundary.
405///
406/// Performs exactly one parse and one query execution per `chunk_one_file`
407/// call — O(1) parses regardless of the number of chunks.
408fn extract_name_captures(
409 source: &str,
410 lang_cfg: &crate::languages::LangConfig,
411) -> Vec<NameCapture> {
412 let mut parser = Parser::new();
413 if parser.set_language(&lang_cfg.language).is_err() {
414 return Vec::new();
415 }
416 let Some(tree) = parser.parse(source, None) else {
417 return Vec::new();
418 };
419 let mut cursor = QueryCursor::new();
420 let mut matches = cursor.matches(&lang_cfg.query, tree.root_node(), source.as_bytes());
421 let capture_names = lang_cfg.query.capture_names();
422 let mut result: Vec<NameCapture> = Vec::new();
423 while let Some(m) = matches.next() {
424 // Collect @name and @def from this match.
425 let mut name_start = 0usize;
426 let mut name_end = 0usize;
427 let mut name_text = String::new();
428 // Store the @def node to use with lsp_symbol_kind_for_node (C2, 4.1.1).
429 let mut def_node: Option<tree_sitter::Node<'_>> = None;
430 let mut has_name = false;
431
432 for cap in m.captures {
433 let cap_name = &capture_names[cap.index as usize];
434 if *cap_name == "name" {
435 let start = cap.node.start_byte();
436 let end = cap.node.end_byte();
437 if end <= source.len() {
438 name_start = start;
439 name_end = end;
440 name_text = source[start..end].to_string();
441 has_name = true;
442 }
443 } else if *cap_name == "def" {
444 def_node = Some(cap.node);
445 }
446 }
447
448 if has_name {
449 let (def_kind, def_start_byte, def_end_byte) = if let Some(node) = def_node {
450 (node.kind().to_string(), node.start_byte(), node.end_byte())
451 } else {
452 (String::new(), name_start, name_end)
453 };
454 result.push(NameCapture {
455 start_byte: name_start,
456 end_byte: name_end,
457 name: name_text,
458 // C2 (4.1.1): Use the decorator-aware lsp_symbol_kind_for_node when
459 // the @def node is available so Python @classmethod vs @property is
460 // correctly classified. Falls back to Variable when there's no @def.
461 lsp_kind: if let Some(node) = def_node {
462 crate::languages::lsp_symbol_kind_for_node(&node, source.as_bytes())
463 } else {
464 crate::languages::lsp_symbol_kind::VARIABLE
465 },
466 def_kind,
467 def_start_byte,
468 def_end_byte,
469 });
470 }
471 }
472 // Sort by byte position so we can scan linearly per boundary.
473 result.sort_unstable_by_key(|c| c.start_byte);
474 result
475}
476
477/// Find the best name and LSP SymbolKind for a chunk covering
478/// `[chunk_start, chunk_end)` bytes.
479///
480/// "Best" = the first [`NameCapture`] whose `start_byte` falls inside the
481/// chunk's byte range. Returns `("", VARIABLE)` if none found (graceful
482/// fallback preserving pre-B1 default kind).
483fn name_for_chunk(captures: &[NameCapture], chunk_start: usize, chunk_end: usize) -> (&str, u32) {
484 for cap in captures {
485 if cap.start_byte >= chunk_start && cap.end_byte <= chunk_end {
486 return (cap.name.as_str(), cap.lsp_kind);
487 }
488 // Since captures are sorted by start byte, once we pass chunk_end
489 // there can be no more candidates.
490 if cap.start_byte >= chunk_end {
491 break;
492 }
493 }
494 ("", crate::languages::lsp_symbol_kind::VARIABLE)
495}
496
497/// Chunk one file. Returns `(file_chunks, file_contents)` — empty
498/// when the file is too large, can't be read, or has no chunks.
499fn chunk_one_file(root: &Path, full: &Path) -> (Vec<CodeChunk>, Vec<String>) {
500 match std::fs::metadata(full) {
501 Ok(meta) if meta.len() > MAX_FILE_BYTES => return (Vec::new(), Vec::new()),
502 Err(_) => return (Vec::new(), Vec::new()),
503 _ => {}
504 }
505 let Ok(source) = std::fs::read_to_string(full) else {
506 return (Vec::new(), Vec::new());
507 };
508
509 let ext = full
510 .extension()
511 .and_then(|e| e.to_str())
512 .unwrap_or_default();
513 let lang_cfg = config_for_extension(ext);
514 let language = lang_cfg.as_ref().map(|c| &c.language);
515
516 // Parse once per file to collect all `@name` + `@def` captures for name
517 // and kind population. Falls back to an empty list when there is no
518 // language config or the parse fails — chunk names remain "" and kind
519 // falls back to Variable.
520 let name_captures: Vec<NameCapture> = lang_cfg
521 .as_deref()
522 .map(|cfg| extract_name_captures(&source, cfg))
523 .unwrap_or_default();
524
525 let rel_path = full
526 .strip_prefix(root)
527 .unwrap_or(full)
528 .display()
529 .to_string();
530
531 let content_kind = ContentKind::from_extension(ext);
532 let boundaries = chunk_source(&source, language, DEFAULT_DESIRED_CHUNK_CHARS);
533 let mut chunks = Vec::with_capacity(boundaries.len());
534 let mut contents = Vec::with_capacity(boundaries.len());
535 for b in boundaries {
536 let text = b.content(&source).to_string();
537 if text.trim().is_empty() {
538 continue;
539 }
540 let (name, lsp_kind) = name_for_chunk(&name_captures, b.start_byte, b.end_byte);
541 let name = name.to_string();
542 // Store the LSP SymbolKind as a decimal string so downstream consumers
543 // (e.g., ripvec-mcp's lsp_workspace_symbols) can parse it directly
544 // without re-running the mapping table. Empty string is preserved for
545 // chunks without a recognised definition (consistent with pre-B2 behaviour).
546 let kind = if name.is_empty() {
547 String::new()
548 } else {
549 lsp_kind.to_string()
550 };
551 contents.push(text.clone());
552 chunks.push(CodeChunk {
553 file_path: rel_path.clone(),
554 name,
555 kind,
556 content_kind,
557 start_line: b.start_line,
558 end_line: b.end_line,
559 // Dense/AST-merge path does not track the identifier line separately;
560 // fall back to start_line per CodeChunk.symbol_line documentation.
561 symbol_line: b.start_line,
562 content: text.clone(),
563 enriched_content: text,
564 qualified_name: None,
565 });
566 }
567
568 // C13W1: HCL per-locals-attribute micro-chunks for the dense pipeline.
569 //
570 // The AST-merge chunker collapses small attribute siblings into the enclosing
571 // `locals { ... }` chunk, so each attribute's @name capture loses its 1:1
572 // identity in `name_for_chunk` (only the first match per boundary wins).
573 // Splice in additional CodeChunks — one per `attribute` @def capture — so
574 // BM25, `find_similar(symbol_name=…)`, and `lsp_workspace_symbols` can all
575 // resolve `local.X` by its bare name. Mirrors `chunk::emit_hcl_local_attribute_chunks`
576 // shape (kind="local_attribute", qualified_name="local.X").
577 let is_hcl = matches!(ext, "tf" | "tfvars" | "hcl");
578 if is_hcl {
579 for cap in &name_captures {
580 if cap.def_kind != "attribute" {
581 continue;
582 }
583 // Defensive bounds check; the query never emits captures past EOF
584 // but the borrow checker can't prove it for stale parses.
585 if cap.def_end_byte > source.len() || cap.def_start_byte >= cap.def_end_byte {
586 continue;
587 }
588 let attr_text = source[cap.def_start_byte..cap.def_end_byte].to_string();
589 // 1-based line numbers — match CodeChunk convention. Count newlines
590 // BEFORE the def start (so first byte = line 1) and through the def
591 // body (so end-line = start-line + lines in body).
592 let line_at = |byte: usize| -> usize {
593 1 + bytecount::count(&source.as_bytes()[..byte.min(source.len())], b'\n')
594 };
595 let start_line = line_at(cap.def_start_byte);
596 let end_line = line_at(cap.def_end_byte.saturating_sub(1).max(cap.def_start_byte));
597 let symbol_line = line_at(cap.start_byte);
598 let qualified = format!("local.{}", cap.name);
599 // Canonical "local_attribute" kind so downstream consumers map it
600 // to LSP Constant via `lsp_symbol_kind_for_node_kind`.
601 contents.push(attr_text.clone());
602 chunks.push(CodeChunk {
603 file_path: rel_path.clone(),
604 name: cap.name.clone(),
605 kind: "local_attribute".to_string(),
606 content_kind,
607 start_line,
608 end_line,
609 symbol_line,
610 enriched_content: attr_text.clone(),
611 content: attr_text,
612 qualified_name: Some(qualified),
613 });
614 }
615 }
616
617 (chunks, contents)
618}
619
620/// Public re-export of [`chunk_one_file`] for integration tests in the
621/// `ripvec-core` test suite (e.g. `tests/repo_map_extractor.rs`).
622///
623/// The underlying function is intentionally private (callers in production
624/// reach it only via the pipeline staged by [`StaticEncoder::embed_root`]).
625/// This shim exists solely so cross-crate-style integration tests can
626/// exercise the per-file chunking path in isolation without standing up the
627/// full pipeline.
628#[must_use]
629pub fn chunk_one_file_pub(root: &Path, full: &Path) -> (Vec<CodeChunk>, Vec<String>) {
630 chunk_one_file(root, full)
631}
632
#[cfg(test)]
mod tests {
    use super::*;
    use crate::encoder::VectorEncoder;

    /// Helper: write `source` to `filename` inside a fresh temp directory and
    /// return `(dir, path)`. The directory is deleted when `dir` is dropped,
    /// so callers must keep it alive for the duration of the test.
    fn write_temp(source: &str, filename: &str) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = dir.path().join(filename);
        std::fs::write(&path, source).expect("write");
        (dir, path)
    }

    /// `test:chunk_one_file_populates_name_from_tree_sitter` — chunk_one_file
    /// must populate `name` from tree-sitter when the source contains a
    /// recognisable definition.
    #[test]
    fn chunk_one_file_populates_name_from_tree_sitter() {
        let source = "pub fn add(a: i32, b: i32) -> i32 { a + b }\n";
        let (dir, path) = write_temp(source, "add.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);
        assert!(
            !chunks.is_empty(),
            "expected at least one chunk from Rust source"
        );
        assert!(
            chunks.iter().any(|c| c.name == "add"),
            "expected at least one chunk with name 'add'; got names: {:?}",
            chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
        );
    }

    /// `test:chunk_one_file_leaves_name_empty_when_no_identifier` — when the
    /// source has no tree-sitter-recognisable definitions, name stays empty.
    #[test]
    fn chunk_one_file_leaves_name_empty_when_no_identifier() {
        // Only whitespace and comments — no function/struct/enum definitions.
        let source = "// just a comment\n   \n// another comment\n";
        let (dir, path) = write_temp(source, "comments.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);
        // Either no chunks at all, or all chunks have an empty name.
        for c in &chunks {
            assert!(
                c.name.is_empty(),
                "expected empty name for comment-only source; got {:?}",
                c.name
            );
        }
    }

    /// `StaticEncoder` implements `VectorEncoder` + Send + Sync.
    /// Compile-time check (`test:static-encoder-implements-vector-encoder`).
    #[test]
    fn static_encoder_implements_vector_encoder() {
        fn assert_trait_object<T: VectorEncoder + Send + Sync>() {}
        assert_trait_object::<StaticEncoder>();
    }

    // -------------------------------------------------------------------------
    // B2: chunk_one_file kind-tagging tests
    // -------------------------------------------------------------------------

    /// `test:chunk_one_file_populates_kind_for_rust_struct` — `chunk_one_file`
    /// emits a chunk whose `kind` is `"23"` (LSP Struct) for a `pub struct`.
    ///
    /// Behavior: trigger-fails-on-baseline-then-passes-post-fix.
    /// On the baseline, `kind` was always `""` (empty string from the semble
    /// chunker), so this test fails. Post-B2 the kind is the LSP numeric string.
    #[test]
    fn chunk_one_file_populates_kind_for_rust_struct() {
        let source = "pub struct Foo { x: i32 }\n";
        let (dir, path) = write_temp(source, "foo.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);
        let struct_chunk = chunks.iter().find(|c| c.name == "Foo");
        assert!(
            struct_chunk.is_some(),
            "expected a chunk named 'Foo'; got: {:?}",
            chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
        );
        let kind = &struct_chunk.unwrap().kind;
        assert_eq!(
            kind.as_str(),
            "23",
            "struct_item must emit LSP SymbolKind::Struct (23); got: {kind:?}"
        );
    }

    /// `test:chunk_one_file_populates_kind_for_rust_trait` — `chunk_one_file`
    /// emits a chunk whose `kind` is `"11"` (LSP Interface) for a trait.
    #[test]
    fn chunk_one_file_populates_kind_for_rust_trait() {
        let source = "pub trait MyTrait { fn method(&self); }\n";
        let (dir, path) = write_temp(source, "trait.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);
        let trait_chunk = chunks.iter().find(|c| c.name == "MyTrait");
        assert!(
            trait_chunk.is_some(),
            "expected a chunk named 'MyTrait'; got: {:?}",
            chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
        );
        let kind = &trait_chunk.unwrap().kind;
        assert_eq!(
            kind.as_str(),
            "11",
            "trait_item must emit LSP SymbolKind::Interface (11); got: {kind:?}"
        );
    }

    /// `test:chunk_one_file_kind_distinct_from_variable_default` — after B2,
    /// named chunks must not carry the old hardcoded `""` (empty) kind.
    ///
    /// Pre-B2 all chunks from the semble AST-merge path had `kind: String::new()`
    /// (= `""`). This test ensures that chunks whose name is non-empty carry a
    /// meaningful, non-empty LSP kind string.
    ///
    /// Note: The semble AST-merge chunker packs adjacent small definitions into a
    /// single chunk and assigns only the FIRST capture's name. The kind test
    /// therefore validates the overall invariant — named chunks have non-empty
    /// kinds — rather than testing each definition independently (which requires
    /// definitions large enough to occupy distinct chunks).
    #[test]
    fn chunk_one_file_kind_distinct_from_variable_default() {
        // Use a file with a single, definitively-named struct so the chunk
        // carries a meaningful kind. The semble chunker will emit one chunk
        // with name "Qux" and kind "23" (Struct).
        let source = "pub struct Qux { x: i32, y: i32 }\n";
        let (dir, path) = write_temp(source, "qux.rs");
        let (chunks, _) = chunk_one_file(dir.path(), &path);

        // Find the named chunk.
        let named_chunks: Vec<_> = chunks.iter().filter(|c| !c.name.is_empty()).collect();
        assert!(
            !named_chunks.is_empty(),
            "expected at least one named chunk from Rust source with struct definition"
        );

        // Every named chunk must have a non-empty kind (pre-B2 regression: kind was "").
        for c in &named_chunks {
            assert!(
                !c.kind.is_empty(),
                "named chunk '{}' must have non-empty kind (pre-B2 regression); got empty",
                c.name
            );
        }

        // The struct chunk specifically must exist and have kind "23"
        // (LSP Struct). A missing chunk is a failure, not a silent skip.
        let qux = named_chunks
            .iter()
            .find(|c| c.name == "Qux")
            .expect("expected a chunk named 'Qux'");
        assert_eq!(
            qux.kind.as_str(),
            "23",
            "Qux (struct_item) must emit LSP SymbolKind::Struct (23); got: {:?}",
            qux.kind
        );
    }

    /// `from_pretrained` loads a model and reports the expected hidden_dim,
    /// identity, and L2-normalized query output.
    ///
    /// Ignored by default because it needs model files staged at the path in
    /// `RIPVEC_SEMBLE_MODEL_PATH`; when the variable is unset the test prints
    /// a note and returns without asserting anything.
    ///
    /// NOTE(review): the test asserts `DEFAULT_HIDDEN_DIM`, so the staged
    /// model must have that width. The test name still refers to
    /// potion-code-16M, which predates the change of default repo.
    ///
    /// Corresponds to acceptance `test:static-encoder-hidden-dim-256` and
    /// `test:static-encoder-loads-potion-code-16m` and
    /// `test:static-encoder-output-is-l2-normalized`.
    #[test]
    #[ignore = "requires local model files at RIPVEC_SEMBLE_MODEL_PATH"]
    fn static_encoder_loads_potion_code_16m() {
        let Ok(path) = std::env::var("RIPVEC_SEMBLE_MODEL_PATH") else {
            eprintln!("RIPVEC_SEMBLE_MODEL_PATH not set; skipping");
            return;
        };
        let enc = StaticEncoder::from_pretrained(&path).expect("model load should succeed");
        assert_eq!(enc.hidden_dim(), DEFAULT_HIDDEN_DIM);
        // identity() reflects what the caller passed (typically the
        // local path under test).
        assert_eq!(enc.identity(), path);

        // Verify L2-normalized output via the public encode_query path.
        let row = enc.encode_query("hello world");
        let norm: f32 = row.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!(
            (norm - 1.0).abs() < 1e-3,
            "expected L2-normalized output; got norm={norm}"
        );
    }

    // ── C2 Tests (4.1.1): Python decorator kind in dense.rs path ─────────

    /// `test:dense_projection_uses_stored_kind_for_python_decorator` —
    /// `chunk_one_file` uses the decorator-aware `lsp_symbol_kind_for_node`
    /// so a Python `@classmethod`-decorated function emits `kind = "12"`
    /// (Function) instead of `"7"` (Property, the AST-less mapping for
    /// `decorated_definition`).
    ///
    /// Behavior: trigger-fails-on-baseline-then-passes-post-fix.
    /// Baseline: `extract_name_captures` called `lsp_symbol_kind_for_node_kind`
    /// which returns `PROPERTY` (7) for `"decorated_definition"`. Post-fix it
    /// calls `lsp_symbol_kind_for_node` which inspects the first decorator.
    #[test]
    fn dense_projection_uses_stored_kind_for_python_decorator() {
        let source = "@classmethod\ndef from_dict(cls, d):\n    return cls()\n";
        let (dir, path) = write_temp(source, "methods.py");
        let (chunks, _) = chunk_one_file(dir.path(), &path);

        let chunk = chunks
            .iter()
            .find(|c| c.name == "from_dict")
            .expect("expected a chunk named 'from_dict'");
        assert_eq!(
            chunk.kind.as_str(),
            "12",
            "C2: @classmethod must emit kind=12 (Function), not 7 (Property); got {:?}",
            chunk.kind
        );
    }

    /// Verify the symmetry: `@property`-decorated functions should still emit
    /// `kind = "7"` (Property) through the dense path.
    #[test]
    fn dense_projection_property_decorator_kind_is_7() {
        let source = "@property\ndef name(self):\n    return self._name\n";
        let (dir, path) = write_temp(source, "props.py");
        let (chunks, _) = chunk_one_file(dir.path(), &path);

        let chunk = chunks
            .iter()
            .find(|c| c.name == "name")
            .expect("expected a chunk named 'name'");
        assert_eq!(
            chunk.kind.as_str(),
            "7",
            "@property must emit kind=7 (Property); got {:?}",
            chunk.kind
        );
    }
}