ripvec_core/encoder/ripvec/dense.rs
1//! Static encoder: in-process `StaticEmbedModel` reimplementation.
2//!
3//! Port of `~/src/semble/src/semble/index/dense.py`. Wraps
4//! [`StaticEmbedModel`] loaded with `minishlab/potion-base-32M`
5//! (256-dim, L2-normalized). Implements [`VectorEncoder`] for the
6//! `--model ripvec` path. CPU-only; no batching ring buffer.
7//!
8//! Default was bumped to `potion-base-32M` in v1.3.0 after the
9//! gutenberg + python-repos matrix showed 32M winning prose by
10//! 0.058 NDCG@10 while losing code by only 0.004 — a clear
11//! single-default win once the i64 mapping bug and the reranker
12//! pooler / sigmoid / truncation bugs were fixed. The code-tuned
13//! `potion-code-16M` is still available via `--model-repo`.
14//!
15//! ## Why not `model2vec-rs`?
16//!
//! The previous wave used the upstream `model2vec-rs` crate. Two real
//! problems pushed us to reimplement (see
//! `crates/ripvec-core/src/encoder/ripvec/static_model.rs` for the
//! full design rationale):
21//!
22//! 1. `model2vec_rs::StaticModel::encode_with_args` runs `pool_ids`
23//! in a serial inner loop while `tokenizers::encode_batch_fast`
24//! spawns its own rayon pool. Wrapping that path in our outer
25//! `par_chunks` produced 60% `__psynch_cvwait` in the linux-corpus
26//! profile — nested rayon scopes parking on each other. The
27//! reimplementation does ONE big tokenize plus a `par_iter` over
28//! `pool_ids` — no nested rayon, no parking.
29//! 2. `model2vec-rs 0.2` pinned `ndarray 0.15`; ripvec-core uses
30//! `ndarray 0.17`. The two `Array2<f32>` types were not
31//! interchangeable, forcing a `Vec<Vec<f32>>` shim. Owning the
32//! load path eliminates the mismatch.
33
34use std::path::{Path, PathBuf};
35use std::sync::Mutex;
36
37use crossbeam_channel::bounded;
38use hf_hub::api::sync::Api;
39use rayon::prelude::*;
40
41use streaming_iterator::StreamingIterator;
42use tree_sitter::{Parser, QueryCursor};
43
44use crate::chunk::CodeChunk;
45use crate::embed::SearchConfig;
46use crate::encoder::VectorEncoder;
47use crate::encoder::ripvec::chunking::{DEFAULT_DESIRED_CHUNK_CHARS, chunk_source};
48use crate::encoder::ripvec::static_model::StaticEmbedModel;
49use crate::languages::config_for_extension;
50use crate::profile::Profiler;
51use crate::walk::collect_files_with_options;
52
/// Encode batch size used by the streaming pipeline: the number of
/// `(chunk, text)` pairs the batch accumulator packs before handing a
/// batch to the encode worker. Matches `StaticEmbedModel`'s internal
/// `BATCH_SIZE` so each emitted batch is exactly one
/// `encode_batch_fast` call's worth of work.
const PIPELINE_BATCH_SIZE: usize = 1024;

/// Capacity of the batch channel between the batch accumulator and the
/// encode worker, i.e. the number of full batches allowed in flight.
/// Provides enough pipeline depth for the encoder to stay busy while
/// the chunker fills the next batch; small enough that peak memory
/// stays bounded.
const PIPELINE_RING_SIZE: usize = 4;

/// Default model repo identifier for the ripvec path. This is the
/// HuggingFace repo string; `StaticEncoder::from_pretrained` accepts
/// either such a repo ID (downloaded via `hf-hub`) or an existing local
/// directory, and stores whatever it was given as `identity()`.
pub const DEFAULT_MODEL_REPO: &str = "minishlab/potion-base-32M";

/// Default hidden dimension for [`DEFAULT_MODEL_REPO`].
pub const DEFAULT_HIDDEN_DIM: usize = 256;

/// Maximum source file size to read, in bytes (mirrors semble's
/// `_MAX_FILE_BYTES = 1_000_000` from `index/create.py:16`). Files
/// larger than this are skipped by `chunk_one_file` without being read.
const MAX_FILE_BYTES: u64 = 1_000_000;
75
/// CPU-only static encoder.
///
/// Owns a loaded [`StaticEmbedModel`] plus identity metadata. The
/// embedder is constructed by `main.rs::load_pipeline` via
/// [`StaticEncoder::from_pretrained`], passing either a local directory
/// containing the Model2Vec files or a HuggingFace repo ID.
pub struct StaticEncoder {
    /// The loaded static embedding model; every encode call delegates to it.
    model: StaticEmbedModel,
    /// The `model_repo` argument given to `from_pretrained`, stored
    /// verbatim and returned by `identity()`.
    model_repo: String,
    /// Embedding width, read from the model once at load time.
    hidden_dim: usize,
}
87
88impl StaticEncoder {
    /// Encode a query string into a single embedding row.
    ///
    /// Thin delegation to the wrapped model's `encode_query`; no state on
    /// the encoder itself is read or modified.
    ///
    /// Used by `RipvecIndex::search` for hybrid/semantic dispatch.
    #[must_use]
    pub fn encode_query(&self, query: &str) -> Vec<f32> {
        self.model.encode_query(query)
    }
96
97 /// Load a model by HuggingFace repo ID or local path.
98 ///
99 /// Two acceptance shapes:
100 ///
101 /// 1. **Local path** — if `model_repo` names an existing directory,
102 /// load directly from it. Used by the parity test fixture path
103 /// (`/tmp/potion-base-32M`) and any user pre-staging files.
104 /// 2. **HuggingFace repo ID** — otherwise treat as `org/repo`,
105 /// download `config.json` / `tokenizer.json` / `model.safetensors`
106 /// via `hf-hub` into `~/.cache/huggingface/hub/`, and load from
107 /// there. Matches `load_classic_cpu` / `load_modernbert_cpu`'s
108 /// behaviour so the user-facing API is consistent: bare `--model
109 /// ripvec` with no `--model-repo` flag works.
110 ///
111 /// # Errors
112 ///
113 /// Propagates the underlying I/O, download, or parse error if the
114 /// files cannot be obtained or the safetensors layout is
115 /// unrecognized.
116 pub fn from_pretrained(model_repo: &str) -> crate::Result<Self> {
117 let resolved = Self::resolve_model_dir(model_repo)?;
118 let model = StaticEmbedModel::from_path(&resolved, Some(true))
119 .map_err(|e| crate::Error::Other(anyhow::anyhow!("static model load failed: {e}")))?;
120 let hidden_dim = model.hidden_dim();
121 Ok(Self {
122 model,
123 model_repo: model_repo.to_string(),
124 hidden_dim,
125 })
126 }
127
128 /// Resolve `model_repo` to a directory containing the model files.
129 ///
130 /// If `model_repo` is an existing local directory, returns it as-is.
131 /// Otherwise downloads via `hf-hub` and returns the cache directory.
132 fn resolve_model_dir(model_repo: &str) -> crate::Result<PathBuf> {
133 let local = Path::new(model_repo);
134 if local.is_dir() {
135 return Ok(local.to_path_buf());
136 }
137
138 // HuggingFace repo path. Download the three required files and
139 // return the directory `hf-hub` cached them into. All files
140 // land in the same snapshot directory.
141 let api = Api::new().map_err(|e| crate::Error::Download(e.to_string()))?;
142 let repo = api.model(model_repo.to_string());
143 let _ = repo
144 .get("config.json")
145 .map_err(|e| crate::Error::Download(e.to_string()))?;
146 let _ = repo
147 .get("tokenizer.json")
148 .map_err(|e| crate::Error::Download(e.to_string()))?;
149 let weights_path = repo
150 .get("model.safetensors")
151 .map_err(|e| crate::Error::Download(e.to_string()))?;
152 // hf-hub returns the file path; the snapshot directory is its parent.
153 weights_path
154 .parent()
155 .map(std::path::Path::to_path_buf)
156 .ok_or_else(|| {
157 crate::Error::Other(anyhow::anyhow!(
158 "hf-hub returned root path for {model_repo}; cannot resolve snapshot dir"
159 ))
160 })
161 }
162
163 /// Chunk + embed an explicit list of files, skipping the walk.
164 ///
165 /// Used by [`RipvecIndex::apply_diff`](crate::encoder::ripvec::index::RipvecIndex::apply_diff)
166 /// to incrementally re-embed just the files that changed since the
167 /// last reconcile. `root` is the corpus root the paths are
168 /// relative to (used for the chunker's `rel_path` field, matching
169 /// what [`VectorEncoder::embed_root`] writes for unchanged files).
170 ///
171 /// Returns `(chunks, embeddings)` in flat lists; ordering mirrors
172 /// the per-file traversal order of `paths`. Files that fail to
173 /// read or chunk are silently skipped (same policy as
174 /// [`chunk_one_file`]).
175 ///
176 /// # Why a separate method
177 ///
178 /// [`VectorEncoder::embed_root`] is a heavy three-stage pipeline
179 /// optimized for full-corpus builds (thousands of files). For the
180 /// "1-50 files changed" case that drives reconciliation, the
181 /// sequential single-batch path here is simpler and faster: no
182 /// rayon pool spin-up, no bounded channels, no inter-stage
183 /// hand-off cost. The batch encode is a single [`encode_batch`]
184 /// call.
185 ///
186 /// # Errors
187 ///
188 /// Returns the underlying error if `encode_batch` fails.
189 pub fn embed_paths(
190 &self,
191 root: &Path,
192 paths: &[std::path::PathBuf],
193 profiler: &Profiler,
194 ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
195 let _guard = profiler.phase("embed_paths");
196 let mut chunks_out: Vec<CodeChunk> = Vec::new();
197 let mut texts: Vec<String> = Vec::new();
198 for path in paths {
199 let (file_chunks, file_texts) = chunk_one_file(root, path);
200 chunks_out.extend(file_chunks);
201 texts.extend(file_texts);
202 }
203 if chunks_out.is_empty() {
204 return Ok((Vec::new(), Vec::new()));
205 }
206 let text_refs: Vec<&str> = texts.iter().map(String::as_str).collect();
207 let embeddings = self.model.encode_batch(&text_refs);
208 debug_assert_eq!(embeddings.len(), chunks_out.len());
209 Ok((chunks_out, embeddings))
210 }
211}
212
213impl VectorEncoder for StaticEncoder {
214 /// Three-stage bounded-queue pipeline:
215 ///
216 /// 1. **Chunk producer** — rayon `par_iter` over the file list. Each
217 /// file is read, parsed by tree-sitter (or line-merged on
218 /// fallback), and emitted as `(CodeChunk, String)` pairs into a
219 /// bounded channel of capacity `PIPELINE_BATCH_SIZE * 8`.
220 /// 2. **Batch accumulator** — a single scoped thread drains the
221 /// chunk channel, packs `PIPELINE_BATCH_SIZE` pairs per batch,
222 /// and forwards into a bounded channel of capacity
223 /// `PIPELINE_RING_SIZE`.
224 /// 3. **Encode worker** — a single scoped thread receives batches
225 /// and calls `StaticEmbedModel::encode_batch`, whose internal
226 /// `par_iter` lights up rayon for the pool_ids kernel.
227 ///
228 /// Why this shape:
229 ///
230 /// - The previous "chunk all, then embed all" implementation held
231 /// the entire `Vec<String>` of chunk contents in memory between
232 /// phases. On the linux corpus that was ~400 MB peak. The
233 /// bounded queues cap in-flight memory at
234 /// `PIPELINE_BATCH_SIZE * 8 + PIPELINE_RING_SIZE * PIPELINE_BATCH_SIZE`
235 /// chunks regardless of corpus size — under 15 MB.
236 /// - The chunk phase (13s on linux) is hidden inside the embed
237 /// phase (70s) instead of serializing before it. Pre-pipeline
238 /// profile showed user-time at 394s on 82s wall = 4.8x
239 /// parallelism on 12 cores; pipeline lets idle cores chew on
240 /// chunking while embed runs.
241 /// - Mirrors `embed::embed_all_streaming`'s shape so the two
242 /// pipelines (BERT + semble) share architectural conventions.
243 fn embed_root(
244 &self,
245 root: &Path,
246 cfg: &SearchConfig,
247 profiler: &Profiler,
248 ) -> crate::Result<(Vec<CodeChunk>, Vec<Vec<f32>>)> {
249 // Phase 1: walk (still serial-to-pipeline because we need the
250 // full file list to par_iter over; the walk itself is rayon).
251 let walk_options = cfg.walk_options();
252 let file_paths = {
253 let _guard = profiler.phase("walk");
254 collect_files_with_options(root, &walk_options)
255 };
256 if file_paths.is_empty() {
257 return Ok((Vec::new(), Vec::new()));
258 }
259
260 // Bounded channels. See module constants for the rationale on
261 // PIPELINE_BATCH_SIZE and PIPELINE_RING_SIZE.
262 let (chunk_tx, chunk_rx) = bounded::<(CodeChunk, String)>(PIPELINE_BATCH_SIZE * 8);
263 let (batch_tx, batch_rx) = bounded::<Vec<(CodeChunk, String)>>(PIPELINE_RING_SIZE);
264
265 // The encoder stage writes ordered output behind a Mutex. Order
266 // across files isn't meaningful (RipvecIndex doesn't rely on
267 // chunk order), only the chunk[i] <-> embedding[i] pairing
268 // matters — which we preserve trivially by pushing in lockstep.
269 let output: Mutex<Vec<(CodeChunk, Vec<f32>)>> = Mutex::new(Vec::new());
270 let model = &self.model;
271
272 // Stage 1 runs on a DEDICATED rayon thread pool. If we used
273 // the global pool, Stage 1's par_iter workers would park on
274 // full `chunk_tx.send()` calls, and Stage 3's
275 // `encode_batch` → `pool_ids` par_iter would have no rayon
276 // workers available (they're all parked). That's a classic
277 // nested-rayon deadlock — observed in profiling as PID stuck
278 // at 0% CPU with 16 parked threads.
279 //
280 // Half the cores for chunking, half remain in the global pool
281 // for the encode worker's pool_ids. The chunk phase (tree-
282 // sitter + I/O bound) doesn't need full parallelism to
283 // pipeline cleanly behind embed.
284 let num_cores = rayon::current_num_threads().max(2);
285 let chunk_threads = (num_cores / 2).max(1);
286 let chunk_pool = rayon::ThreadPoolBuilder::new()
287 .num_threads(chunk_threads)
288 .thread_name(|i| format!("semble-chunk-{i}"))
289 .build()
290 .map_err(|e| crate::Error::Other(anyhow::anyhow!("chunk thread pool build: {e}")))?;
291
292 let _phase_guard = profiler.phase("pipeline");
293 std::thread::scope(|scope| {
294 // Stage 1: chunk producer on the dedicated pool.
295 let chunk_tx_owned = chunk_tx;
296 scope.spawn(move || {
297 chunk_pool.install(|| {
298 file_paths.par_iter().for_each(|full| {
299 let (chunks, contents) = chunk_one_file(root, full);
300 for (chunk, content) in chunks.into_iter().zip(contents) {
301 if chunk_tx_owned.send((chunk, content)).is_err() {
302 return;
303 }
304 }
305 });
306 });
307 // chunk_tx_owned drops here, closing the channel.
308 });
309
310 // Stage 2: batch accumulator.
311 let batch_tx_owned = batch_tx;
312 scope.spawn(move || {
313 let mut buf: Vec<(CodeChunk, String)> = Vec::with_capacity(PIPELINE_BATCH_SIZE);
314 for pair in chunk_rx {
315 buf.push(pair);
316 if buf.len() >= PIPELINE_BATCH_SIZE {
317 let batch =
318 std::mem::replace(&mut buf, Vec::with_capacity(PIPELINE_BATCH_SIZE));
319 if batch_tx_owned.send(batch).is_err() {
320 return;
321 }
322 }
323 }
324 if !buf.is_empty() {
325 let _ = batch_tx_owned.send(buf);
326 }
327 // batch_tx_owned drops here, closing the channel.
328 });
329
330 // Stage 3: encode worker.
331 scope.spawn(|| {
332 for batch in batch_rx {
333 if batch.is_empty() {
334 continue;
335 }
336 let mut chunks = Vec::with_capacity(batch.len());
337 let mut texts: Vec<String> = Vec::with_capacity(batch.len());
338 for (chunk, text) in batch {
339 chunks.push(chunk);
340 texts.push(text);
341 }
342 let text_refs: Vec<&str> = texts.iter().map(String::as_str).collect();
343 let embeddings = model.encode_batch(&text_refs);
344 debug_assert_eq!(embeddings.len(), chunks.len());
345 let mut out = output.lock().expect("output mutex poisoned");
346 for (chunk, emb) in chunks.into_iter().zip(embeddings) {
347 out.push((chunk, emb));
348 }
349 }
350 });
351 });
352
353 let collected = output.into_inner().expect("output mutex poisoned");
354 let mut chunks_out = Vec::with_capacity(collected.len());
355 let mut embs_out = Vec::with_capacity(collected.len());
356 for (chunk, emb) in collected {
357 chunks_out.push(chunk);
358 embs_out.push(emb);
359 }
360 Ok((chunks_out, embs_out))
361 }
362
    /// Width of the embedding vectors, as stored in the encoder's
    /// `hidden_dim` field.
    fn hidden_dim(&self) -> usize {
        self.hidden_dim
    }
366
    /// Identifier of the loaded model: the stored `model_repo` string,
    /// borrowed from the encoder.
    fn identity(&self) -> &str {
        &self.model_repo
    }
370}
371
372/// Extract `@name` capture positions from a tree-sitter parse of `source`
373/// using the language config's compiled query.
374///
375/// Returns a list of `(start_byte, end_byte, name_text)` for every `@name`
376/// capture found. The list is sorted by `start_byte` so callers can do a
377/// linear scan per chunk boundary.
378///
379/// Performs exactly one parse and one query execution per `chunk_one_file`
380/// call — O(1) parses regardless of the number of chunks.
381fn extract_name_captures(
382 source: &str,
383 lang_cfg: &crate::languages::LangConfig,
384) -> Vec<(usize, usize, String)> {
385 let mut parser = Parser::new();
386 if parser.set_language(&lang_cfg.language).is_err() {
387 return Vec::new();
388 }
389 let Some(tree) = parser.parse(source, None) else {
390 return Vec::new();
391 };
392 let mut cursor = QueryCursor::new();
393 let mut matches = cursor.matches(&lang_cfg.query, tree.root_node(), source.as_bytes());
394 let capture_names = lang_cfg.query.capture_names();
395 let mut result: Vec<(usize, usize, String)> = Vec::new();
396 while let Some(m) = matches.next() {
397 for cap in m.captures {
398 if capture_names[cap.index as usize] == "name" {
399 let start = cap.node.start_byte();
400 let end = cap.node.end_byte();
401 if end <= source.len() {
402 let name = source[start..end].to_string();
403 result.push((start, end, name));
404 }
405 }
406 }
407 }
408 // Sort by byte position so we can scan linearly per boundary.
409 result.sort_unstable_by_key(|&(s, _, _)| s);
410 result
411}
412
/// Find the best name for a chunk covering `[chunk_start, chunk_end)` bytes.
///
/// "Best" = the first capture (in `start_byte` order) that lies entirely
/// inside the chunk: its start is at or after `chunk_start` and its end
/// is at or before `chunk_end`. A capture that begins inside the chunk
/// but runs past its end is not used. Returns `""` if none qualifies
/// (graceful fallback).
///
/// `captures` must be sorted by start byte, which is how
/// `extract_name_captures` returns them. The ordering lets the search
/// binary-search to the first candidate instead of rescanning the list
/// from the beginning for every chunk of a file.
fn name_for_chunk(
    captures: &[(usize, usize, String)],
    chunk_start: usize,
    chunk_end: usize,
) -> &str {
    // Everything before this index starts ahead of the chunk and can
    // never match.
    let first_candidate = captures.partition_point(|(start, _, _)| *start < chunk_start);

    for (start, end, name) in &captures[first_candidate..] {
        // `start >= chunk_start` already holds for every remaining entry.
        if *end <= chunk_end {
            return name.as_str();
        }
        // Sorted by start: once a capture starts at or past the chunk's
        // end, no later one can be inside it.
        if *start >= chunk_end {
            break;
        }
    }
    ""
}
434
435/// Chunk one file. Returns `(file_chunks, file_contents)` — empty
436/// when the file is too large, can't be read, or has no chunks.
437fn chunk_one_file(root: &Path, full: &Path) -> (Vec<CodeChunk>, Vec<String>) {
438 match std::fs::metadata(full) {
439 Ok(meta) if meta.len() > MAX_FILE_BYTES => return (Vec::new(), Vec::new()),
440 Err(_) => return (Vec::new(), Vec::new()),
441 _ => {}
442 }
443 let Ok(source) = std::fs::read_to_string(full) else {
444 return (Vec::new(), Vec::new());
445 };
446
447 let ext = full
448 .extension()
449 .and_then(|e| e.to_str())
450 .unwrap_or_default();
451 let lang_cfg = config_for_extension(ext);
452 let language = lang_cfg.as_ref().map(|c| &c.language);
453
454 // Parse once per file to collect all `@name` captures for name
455 // population. Falls back to an empty list when there is no
456 // language config or the parse fails — chunk names remain "".
457 let name_captures: Vec<(usize, usize, String)> = lang_cfg
458 .as_deref()
459 .map(|cfg| extract_name_captures(&source, cfg))
460 .unwrap_or_default();
461
462 let rel_path = full
463 .strip_prefix(root)
464 .unwrap_or(full)
465 .display()
466 .to_string();
467
468 let boundaries = chunk_source(&source, language, DEFAULT_DESIRED_CHUNK_CHARS);
469 let mut chunks = Vec::with_capacity(boundaries.len());
470 let mut contents = Vec::with_capacity(boundaries.len());
471 for b in boundaries {
472 let text = b.content(&source).to_string();
473 if text.trim().is_empty() {
474 continue;
475 }
476 let name = name_for_chunk(&name_captures, b.start_byte, b.end_byte).to_string();
477 contents.push(text.clone());
478 chunks.push(CodeChunk {
479 file_path: rel_path.clone(),
480 name,
481 kind: String::new(),
482 start_line: b.start_line,
483 end_line: b.end_line,
484 content: text.clone(),
485 enriched_content: text,
486 });
487 }
488 (chunks, contents)
489}
490
#[cfg(test)]
mod tests {
    use super::*;
    use crate::encoder::VectorEncoder;
    use std::io::Write as _;

    /// Write `source` to a file called `file_name` inside a fresh temp
    /// directory. The directory handle is returned alongside the path so
    /// the caller keeps the directory alive for as long as it needs the
    /// file.
    fn write_fixture(file_name: &str, source: &str) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempfile::tempdir().expect("tempdir");
        let path = dir.path().join(file_name);
        {
            let mut f = std::fs::File::create(&path).expect("create");
            f.write_all(source.as_bytes()).expect("write");
        }
        (dir, path)
    }

    /// `test:chunk_one_file_populates_name_from_tree_sitter` — a Rust file
    /// containing a function definition must yield at least one chunk
    /// whose `name` is that function's identifier.
    #[test]
    fn chunk_one_file_populates_name_from_tree_sitter() {
        let (dir, path) = write_fixture("add.rs", "pub fn add(a: i32, b: i32) -> i32 { a + b }\n");

        let (chunks, _) = chunk_one_file(dir.path(), &path);

        assert!(
            !chunks.is_empty(),
            "expected at least one chunk from Rust source"
        );
        let found_add = chunks.iter().any(|c| c.name == "add");
        assert!(
            found_add,
            "expected at least one chunk with name 'add'; got names: {:?}",
            chunks.iter().map(|c| c.name.as_str()).collect::<Vec<_>>()
        );
    }

    /// `test:chunk_one_file_leaves_name_empty_when_no_identifier` — a file
    /// with only comments and whitespace has no definitions to name, so
    /// every chunk (if any are produced at all) must have an empty name.
    #[test]
    fn chunk_one_file_leaves_name_empty_when_no_identifier() {
        let (dir, path) = write_fixture(
            "comments.rs",
            "// just a comment\n   \n// another comment\n",
        );

        let (chunks, _) = chunk_one_file(dir.path(), &path);

        for c in &chunks {
            assert!(
                c.name.is_empty(),
                "expected empty name for comment-only source; got {:?}",
                c.name
            );
        }
    }

    /// `StaticEncoder` implements `VectorEncoder` + Send + Sync.
    /// Purely a compile-time check
    /// (`test:static-encoder-implements-vector-encoder`).
    #[test]
    fn static_encoder_implements_vector_encoder() {
        fn assert_trait_object<T: VectorEncoder + Send + Sync>() {}
        assert_trait_object::<StaticEncoder>();
    }

    /// Loads the model named by `RIPVEC_SEMBLE_MODEL_PATH` and checks the
    /// hidden dimension, the identity string, and that a query embedding
    /// has unit L2 norm. Ignored by default; when run without the
    /// variable set it prints a notice and returns.
    ///
    /// Corresponds to acceptance `test:static-encoder-hidden-dim-256`,
    /// `test:static-encoder-loads-potion-code-16m` and
    /// `test:static-encoder-output-is-l2-normalized`.
    #[test]
    #[ignore = "requires local model files at RIPVEC_SEMBLE_MODEL_PATH"]
    fn static_encoder_loads_potion_code_16m() {
        let path = match std::env::var("RIPVEC_SEMBLE_MODEL_PATH") {
            Ok(p) => p,
            Err(_) => {
                eprintln!("RIPVEC_SEMBLE_MODEL_PATH not set; skipping");
                return;
            }
        };

        let enc = StaticEncoder::from_pretrained(&path).expect("model load should succeed");
        assert_eq!(enc.hidden_dim(), DEFAULT_HIDDEN_DIM);
        // identity() echoes exactly what was passed in (here, the local
        // path under test).
        assert_eq!(enc.identity(), path);

        // L2 normalization is checked through the public query path.
        let row = enc.encode_query("hello world");
        let squared_sum: f32 = row.iter().map(|x| x * x).sum();
        let norm = squared_sum.sqrt();
        assert!(
            (norm - 1.0).abs() < 1e-3,
            "expected L2-normalized output; got norm={norm}"
        );
    }
}
579}