ripvec-core 2.1.0

Semantic code + document search engine. Cacheless static-embedding + cross-encoder rerank by default; optional ModernBERT/BGE transformer engines with GPU backends. Tree-sitter chunking, hybrid BM25 + PageRank, composable ranking layers.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
//! Hybrid search: RRF fusion of semantic + BM25, then boosts and rerank.
//!
//! Port of `~/src/semble/src/semble/search.py`. Three entry points:
//!
//! - [`search_semantic`] — cosine similarity over the dense index.
//! - [`search_bm25`](crate::encoder::ripvec::bm25::search_bm25) — BM25
//!   scoring (re-exported from the bm25 module).
//! - [`search_hybrid`] — fuses both ranked lists via Reciprocal Rank
//!   Fusion (k=60), over-fetching `top_k * 5` candidates, then applies
//!   ripvec's `boost_multi_chunk_files` + `apply_query_boost` + the
//!   penalty-aware `rerank_topk`.

use std::collections::{HashMap, HashSet};

use ndarray::{Array1, Array2, ArrayView1, s};
use rayon::prelude::*;

use crate::chunk::CodeChunk;
use crate::encoder::ripvec::bm25::{Bm25Index, search_bm25};
use crate::encoder::ripvec::penalties::rerank_topk;
use crate::encoder::ripvec::ranking::{apply_query_boost, boost_multi_chunk_files, resolve_alpha};

/// Reciprocal Rank Fusion smoothing constant. Matches Python
/// `_RRF_K = 60` from `search.py:11`.
pub const RRF_K: f32 = 60.0;

/// Over-fetch factor when assembling the hybrid candidate pool.
const CANDIDATE_MULTIPLIER: usize = 5;

/// Parallel matrix-vector multiply: `scores = matrix @ vector`.
///
/// Splits the matrix into one row-chunk per rayon worker. Each worker
/// computes its slice's sgemv via ndarray's BLAS dispatch and writes
/// into a disjoint output range. The chunk size is rounded up so the
/// number of shards equals the rayon worker count (no work-stealing
/// imbalance for symmetric input).
///
/// For a 1M-row × 256-col matrix on a 12-core M2 Max this approaches
/// the aggregate memory-bandwidth ceiling (~250 GB/s) instead of the
/// single-core ceiling (~50-80 GB/s) Accelerate's serial sgemv
/// otherwise caps us at.
/// Row count below which a single serial BLAS sgemv is faster than
/// rayon-sharded parallel sgemv (the per-thread dispatch overhead
/// dominates the inner work for small matrices).
const SGEMV_SERIAL_THRESHOLD: usize = 4096;

/// Parallel matrix-vector multiply via row-sharded BLAS sgemv.
///
/// See call site in `search_semantic` for the rationale; in short,
/// Accelerate's level-2 BLAS is single-threaded on macOS, so we shard
/// the matrix into row-chunks and call sgemv per worker to saturate
/// aggregate memory bandwidth.
///
/// # Panics
///
/// Panics if ndarray returns a non-contiguous slice from
/// `Array2::slice(s![start..end, ..])`. Row slices of a row-major
/// matrix are always contiguous, so this is structurally unreachable;
/// the panic guards against future layout changes that would silently
/// break correctness.
#[must_use]
pub fn parallel_sgemv(matrix: &Array2<f32>, vector: &ArrayView1<f32>) -> Array1<f32> {
    let n = matrix.nrows();
    if n == 0 {
        return Array1::zeros(0);
    }
    let n_threads = rayon::current_num_threads().max(1);
    if n <= SGEMV_SERIAL_THRESHOLD || n_threads == 1 {
        return matrix.dot(vector);
    }
    let chunk_size = n.div_ceil(n_threads);
    let mut scores = vec![0.0_f32; n];
    scores
        .par_chunks_mut(chunk_size)
        .enumerate()
        .for_each(|(thread_idx, out)| {
            let start = thread_idx * chunk_size;
            let end = (start + out.len()).min(n);
            let slice = matrix.slice(s![start..end, ..]);
            let local: Array1<f32> = slice.dot(vector);
            // SAFETY in spirit: `local` length == `out` length by
            // construction (`out.len() == end - start` from
            // par_chunks_mut, and `slice.nrows() == end - start`).
            out.copy_from_slice(local.as_slice().expect("sgemv output contiguous"));
        });
    // `Array1::from_vec` is O(1).
    Array1::from_vec(scores)
}

/// Pure semantic search: rank every chunk by dot product against the
/// query embedding, then take the top-k after optional selector mask.
///
/// Math:
///   scores = chunk_embeddings @ query_embedding
///   top-k by select_nth_unstable_by, then sort the survivors.
///
/// `chunk_embeddings` is row-major `[n_chunks, hidden_dim]`; with the
/// `cpu-accelerate` feature ndarray's `.dot()` dispatches to Accelerate's
/// `cblas_sgemv`, which is vendor-tuned and near memory-bandwidth-bound
/// (1 GB read per query at ~250 GB/s = ~4 ms theoretical floor on 1M
/// chunks at 256 dim). Earlier scalar pointer-chasing path took 583
/// ms per query (profile: samply v1, 2026-05-21).
///
/// Top-k uses `select_nth_unstable_by` (O(N) average) instead of a
/// full sort (O(N log N)) — at 1M chunks selecting top-100 that's
/// ~1M ops vs ~20M.
#[must_use]
pub fn search_semantic(
    query_embedding: &[f32],
    chunk_embeddings: &Array2<f32>,
    top_k: usize,
    selector: Option<&[usize]>,
) -> Vec<(usize, f32)> {
    let n_chunks = chunk_embeddings.nrows();
    if top_k == 0 || n_chunks == 0 {
        return Vec::new();
    }
    debug_assert_eq!(
        query_embedding.len(),
        chunk_embeddings.ncols(),
        "query embedding dim ({}) != chunk embedding dim ({})",
        query_embedding.len(),
        chunk_embeddings.ncols(),
    );

    // GEMV: scores[i] = sum_d chunk_embeddings[i, d] * query[d].
    //
    // Accelerate's level-2 BLAS (`cblas_sgemv`) is single-threaded on
    // macOS — only level-3 (GEMM) gets the multi-thread treatment.
    // Single-core memory bandwidth on M2 Max is ~50-80 GB/s; the
    // 1M-chunk × 256-dim matrix is 1 GB, so a single sgemv pays
    // ~12-20 ms just on memory bandwidth and we measured ~76 ms in
    // the profile.
    //
    // Fix: shard the matrix into row-chunks and dispatch one sgemv
    // per rayon worker. Each thread reads its slice independently;
    // aggregate bandwidth on M2 Max scales to ~250 GB/s with all
    // cores active. Theoretical floor drops to ~4 ms. Each shard's
    // sgemv is itself BLAS-optimal; we just stop forcing serial.
    let query: ArrayView1<f32> = ArrayView1::from(query_embedding);
    let scores: Array1<f32> = parallel_sgemv(chunk_embeddings, &query);

    // Filter by selector if set. Build a HashSet for O(1) membership;
    // at 1M chunks the HashSet is ~50 ms to build but per-chunk lookup
    // amortises against the avoided dense scoring elsewhere.
    let selector_set: Option<HashSet<usize>> =
        selector.map(|s| s.iter().copied().collect());

    let mut scored: Vec<(usize, f32)> = if let Some(set) = selector_set {
        scores
            .iter()
            .enumerate()
            .filter(|(i, _)| set.contains(i))
            .map(|(i, &s)| (i, s))
            .collect()
    } else {
        // No selector: keep everything (we'll partial-sort below).
        scores
            .iter()
            .enumerate()
            .map(|(i, &s)| (i, s))
            .collect()
    };

    // Top-k via O(N) selection. `select_nth_unstable_by` partitions
    // around the k-th element; everything before it is in (unsorted)
    // top-k. We then sort that small slice to recover the ordering.
    if scored.len() > top_k {
        scored.select_nth_unstable_by(top_k - 1, |a, b| {
            b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0))
        });
        scored.truncate(top_k);
    }
    scored.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    scored
}

/// Convert a list of `(index, raw_score)` to RRF scores.
/// `rrf_score = 1 / (RRF_K + rank)` where rank is 1-based and the
/// list is sorted descending by raw_score.
fn rrf_scores(ranked: &[(usize, f32)]) -> HashMap<usize, f32> {
    ranked
        .iter()
        .enumerate()
        .map(|(rank0, (idx, _))| {
            let rank = rank0 as f32 + 1.0;
            (*idx, 1.0 / (RRF_K + rank))
        })
        .collect()
}

/// Hybrid search: alpha-weighted RRF fusion of semantic + BM25,
/// followed by file-coherence + query boosts and the penalty-aware
/// reranker. Mirrors `search.py:search_hybrid`.
///
/// `query_embedding` is the embedding of `query` produced by the same
/// encoder that populated `chunk_embeddings`.
///
/// Over-fetches `top_k * 5` candidates from both sub-searches before
/// fusing, so the merged pool is large enough that the boosts and
/// reranker can do meaningful work.
#[must_use]
pub fn search_hybrid(
    query: &str,
    query_embedding: &[f32],
    chunk_embeddings: &Array2<f32>,
    chunks: &[CodeChunk],
    bm25: &Bm25Index,
    top_k: usize,
    alpha: Option<f32>,
    selector: Option<&[usize]>,
) -> Vec<(usize, f32)> {
    if top_k == 0 || chunks.is_empty() {
        return Vec::new();
    }
    let alpha_weight = resolve_alpha(query, alpha);
    let candidate_count = top_k.saturating_mul(CANDIDATE_MULTIPLIER);

    let semantic = search_semantic(query_embedding, chunk_embeddings, candidate_count, selector);
    let bm25_hits = search_bm25(query, bm25, candidate_count, selector);

    let normalized_semantic = rrf_scores(&semantic);
    let normalized_bm25 = rrf_scores(&bm25_hits);

    // Union of all chunks present in either ranked list.
    let mut combined: HashMap<usize, f32> = HashMap::new();
    let union: HashSet<usize> = normalized_semantic
        .keys()
        .chain(normalized_bm25.keys())
        .copied()
        .collect();
    for idx in union {
        let s = normalized_semantic.get(&idx).copied().unwrap_or(0.0);
        let b = normalized_bm25.get(&idx).copied().unwrap_or(0.0);
        combined.insert(idx, alpha_weight * s + (1.0 - alpha_weight) * b);
    }

    // Multi-chunk-file boost (in-place).
    boost_multi_chunk_files(&mut combined, chunks);
    // Query-type boost (returns a new map; matches Python's behaviour).
    let boosted = apply_query_boost(&combined, query, chunks);

    // Path penalties + saturation rerank.
    // Semble disables path penalties for pure-semantic queries (α=1.0);
    // alpha_weight comes from resolve_alpha so the < 1.0 condition matches
    // Python's `penalise_paths=alpha_weight < 1.0` at search.py:121.
    let penalise_paths = alpha_weight < 1.0;
    let scores_vec: Vec<(usize, f32)> = boosted.into_iter().collect();
    rerank_topk(&scores_vec, chunks, top_k, penalise_paths)
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::encoder::ripvec::bm25::Bm25Index;

    fn chunk(path: &str, content: &str) -> CodeChunk {
        CodeChunk {
            file_path: path.to_string(),
            name: String::new(),
            kind: String::new(),
            start_line: 1,
            end_line: 1,
            content: content.to_string(),
            enriched_content: content.to_string(),
        }
    }

    fn unit_vec(values: &[f32]) -> Vec<f32> {
        let norm: f32 = values.iter().map(|x| x * x).sum::<f32>().sqrt().max(1e-12);
        values.iter().map(|x| x / norm).collect()
    }

    /// `test:rrf-k-60` — RRF scores use k=60 with 1-based ranks.
    /// Rank 1 → 1/61; rank 2 → 1/62; rank 3 → 1/63.
    #[test]
    fn rrf_k_60() {
        let ranked = vec![(7, 0.9), (3, 0.8), (5, 0.5)];
        let rrf = rrf_scores(&ranked);
        assert!((rrf[&7] - 1.0 / 61.0).abs() < 1e-7);
        assert!((rrf[&3] - 1.0 / 62.0).abs() < 1e-7);
        assert!((rrf[&5] - 1.0 / 63.0).abs() < 1e-7);
    }

    /// `test:hybrid-candidate-count-5x-top-k` — when both sub-searches
    /// produce enough hits, hybrid over-fetches 5x top_k.
    #[test]
    fn hybrid_candidate_count_5x_top_k() {
        // 10 chunks; embedding = a unit vector that aligns with chunk
        // idx. Query embedding aligns most strongly with chunk 0.
        let chunks: Vec<CodeChunk> = (0..10)
            .map(|i| chunk(&format!("src/f{i}.rs"), &format!("content {i}")))
            .collect();
        let flat: Vec<f32> = (0..10)
            .flat_map(|i| {
                let mut v = vec![0.0_f32; 10];
                v[i] = 1.0;
                v
            })
            .collect();
        let embeddings = Array2::from_shape_vec((10, 10), flat).unwrap();
        let query_emb = unit_vec(&{
            let mut q = vec![0.0_f32; 10];
            q[0] = 1.0;
            q
        });
        let bm25 = Bm25Index::build(&chunks);
        let results = search_hybrid(
            "content",
            &query_emb,
            &embeddings,
            &chunks,
            &bm25,
            2,
            Some(0.5),
            None,
        );
        // top_k=2; the semantic best hit (chunk 0) should be present.
        assert!(!results.is_empty());
        assert!(results.iter().any(|(i, _)| *i == 0));
        assert!(results.len() <= 2);
    }

    /// `test:hybrid-zero-bm25-excluded-from-fusion` — BM25 zero scores
    /// don't enter the RRF pool because `search_bm25` drops them.
    #[test]
    fn hybrid_zero_bm25_excluded_from_fusion() {
        let chunks = vec![chunk("src/a.rs", "alpha"), chunk("src/b.rs", "bravo")];
        let bm25 = Bm25Index::build(&chunks);
        // Query "alpha" only matches doc 0 in BM25.
        let bm = search_bm25("alpha", &bm25, 10, None);
        assert_eq!(bm.len(), 1);
        let rrf = rrf_scores(&bm);
        assert!(
            !rrf.contains_key(&1),
            "BM25 zero-score doc should be excluded"
        );
    }

    /// `test:hybrid-applies-rerank-topk` — file-saturation decay applies
    /// when hybrid returns multiple chunks from the same file.
    #[test]
    fn hybrid_applies_rerank_topk() {
        // Two chunks in the same file with identical embeddings will
        // tie in both sub-rankings; rerank_topk applies the 0.5 decay
        // so the second chunk's effective score is half of the first.
        let chunks = vec![
            chunk("src/a.rs", "alpha bravo"),
            chunk("src/a.rs", "alpha bravo"),
        ];
        let embeddings = Array2::from_shape_vec((2, 2), vec![1.0_f32, 0.0, 1.0, 0.0]).unwrap();
        let bm25 = Bm25Index::build(&chunks);
        let query_emb = vec![1.0_f32, 0.0];
        let results = search_hybrid(
            "alpha",
            &query_emb,
            &embeddings,
            &chunks,
            &bm25,
            2,
            Some(0.5),
            None,
        );
        assert_eq!(results.len(), 2);
        // The first hit's score should be strictly greater than the
        // second's (saturation decay).
        assert!(
            results[0].1 > results[1].1,
            "expected saturation decay; got scores={results:?}"
        );
    }

    /// `test:hybrid-applies-query-boost` and
    /// `test:hybrid-applies-multi-chunk-boost` are exercised transitively
    /// by the rerank_topk and boost_multi_chunk_files unit tests in their
    /// respective modules — the wiring in this module is a single call
    /// through each. A non-trivial regression here would require a
    /// behavioural shift in those modules, which their own tests cover.
    #[test]
    fn hybrid_pipeline_wires_through_boosts_and_rerank() {
        // Smoke test: a query that touches a chunk whose file stem matches
        // it should bubble up via the apply_query_boost stem-match path.
        let chunks = vec![
            chunk("src/auth.rs", "fn login() {}"),
            chunk("src/utils.rs", "fn unrelated() {}"),
        ];
        let embeddings = Array2::from_shape_vec((2, 2), vec![1.0_f32, 0.0, 0.0, 1.0]).unwrap();
        let bm25 = Bm25Index::build(&chunks);
        let query_emb = vec![0.0_f32, 0.0]; // unhelpful semantic vector
        let results = search_hybrid(
            "auth",
            &query_emb,
            &embeddings,
            &chunks,
            &bm25,
            2,
            Some(0.5),
            None,
        );
        // The auth.rs chunk should rank first because the stem matches.
        assert!(!results.is_empty());
        let top = results[0].0;
        assert_eq!(top, 0, "expected auth.rs first; got {results:?}");
    }
}