ripvec-core 3.0.2

Semantic code + document search engine. Cacheless static-embedding + cross-encoder rerank by default; optional ModernBERT/BGE transformer engines with GPU backends. Tree-sitter chunking, hybrid BM25 + PageRank, composable ranking layers.
Documentation
//! End-to-end parity test for the semble retrieval port.
//!
//! Builds a fixed in-memory corpus, runs queries through `RipvecIndex`,
//! and asserts top-k properties that mirror what semble's Python
//! implementation produces on the same inputs. Lifted from the
//! reference behaviours documented in `~/src/semble/tests/`.
//!
//! Gated with `#[ignore]` because the first run downloads
//! `minishlab/potion-code-16M` (~16 MB). Run with `cargo test --test
//! ripvec_port_parity -- --ignored`; later runs reuse the cached model.

use std::fs;
use std::path::PathBuf;

use ripvec_core::embed::SearchConfig;
use ripvec_core::encoder::ripvec::dense::{DEFAULT_MODEL_REPO, StaticEncoder};
use ripvec_core::encoder::ripvec::index::RipvecIndex;
use ripvec_core::hybrid::SearchMode;
use ripvec_core::profile::Profiler;

/// Materialise the fixture corpus under `tmp` and hand back its root.
///
/// Five small files are written (three Rust, two Python); missing
/// parent directories are created on demand. The layout gives each
/// parity test in this file one obvious target:
/// - `src/auth.py` for the natural-language "authentication" query.
/// - `src/auth_service.rs` for the `AuthService` symbol query and the
///   `auth_service` keyword query.
/// - `src/parser.rs` for the "parse json" query.
/// - `tests/test_auth.py` as the test-path counterpart of `src/auth.py`.
/// - `src/utils.rs` as unrelated filler.
///
/// # Panics
///
/// Panics if a directory or a file cannot be created.
fn build_test_corpus(tmp: &tempfile::TempDir) -> PathBuf {
    // (path relative to the corpus root, file body)
    const FIXTURES: [(&str, &str); 5] = [
        (
            "src/auth.py",
            "def authenticate_user(token: str) -> bool:\n    return verify_token(token)\n\n\
             def verify_token(token: str) -> bool:\n    return token.startswith('valid')\n",
        ),
        (
            "src/auth_service.rs",
            "pub struct AuthService { secret: String }\n\
             impl AuthService {\n    pub fn verify(&self, token: &str) -> bool { token == self.secret }\n}\n",
        ),
        (
            "src/parser.rs",
            "pub fn parse_json(input: &str) -> Result<Value, Error> {\n    serde_json::from_str(input)\n}\n",
        ),
        (
            "src/utils.rs",
            "pub fn unrelated_utility() {\n    println!(\"hello\")\n}\n",
        ),
        (
            "tests/test_auth.py",
            "def test_authenticate_user():\n    assert authenticate_user('valid_token')\n",
        ),
    ];

    let base = tmp.path();
    for &(relative, body) in &FIXTURES {
        let target = base.join(relative);
        // `src/` and `tests/` do not exist in a fresh temp dir.
        if let Some(dir) = target.parent() {
            fs::create_dir_all(dir).expect("mkdir failed");
        }
        fs::write(&target, body).expect("write failed");
    }
    base.to_path_buf()
}

/// Pick where the model2vec weights are loaded from.
///
/// Returns the value of `RIPVEC_SEMBLE_MODEL_PATH` when it is set to a
/// non-blank string, so contributors on firewalled networks can
/// pre-download the model via curl and point the tests at a local
/// directory. Otherwise returns [`DEFAULT_MODEL_REPO`] (the HF Hub
/// repo; network required).
///
/// A blank value is treated as unset: an empty string can never name a
/// model directory or a Hub repo, and it is what an undefined CI
/// variable typically expands to.
fn resolve_model_source() -> String {
    std::env::var("RIPVEC_SEMBLE_MODEL_PATH")
        .ok()
        .filter(|path| !path.trim().is_empty())
        .unwrap_or_else(|| DEFAULT_MODEL_REPO.to_string())
}

/// Process-wide mutex used to serialise the first model download.
///
/// `cargo test` runs test fns in parallel by default, and six tests
/// racing to download the same `.safetensors` blob hit `hf-hub`'s lock
/// file with "Lock acquisition failed" errors. Callers take this lock
/// around the load so only one downloader runs at a time; later
/// callers find the HF cache already populated.
///
/// Every call returns a reference to the same mutex. `Mutex::new` is a
/// `const fn`, so a plain `static` is enough and no lazy
/// initialisation is needed.
fn download_lock() -> &'static std::sync::Mutex<()> {
    static DOWNLOAD_GATE: std::sync::Mutex<()> = std::sync::Mutex::new(());
    &DOWNLOAD_GATE
}

/// Load the static encoder and build a [`RipvecIndex`] over `root`.
///
/// The encoder is loaded from [`resolve_model_source`] while holding
/// [`download_lock`], so parallel tests do not race on the model
/// download. The index is then built outside the lock with a fixed
/// hybrid-mode [`SearchConfig`] and a no-op profiler.
///
/// # Panics
///
/// Panics with pre-download instructions if the model cannot be
/// loaded, and if the index build fails.
fn load_index(root: &std::path::Path) -> RipvecIndex {
    let source = resolve_model_source();
    let encoder = {
        // The mutex guards `()`, so a poisoned lock carries no broken
        // state. Recover instead of panicking: if one test's model load
        // fails while holding the guard, the remaining tests should
        // report their own load error (with the instructions below)
        // rather than an opaque poison message.
        let _guard = download_lock()
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        StaticEncoder::from_pretrained(&source).unwrap_or_else(|e| {
            panic!(
                "model2vec load failed for source {source:?}: {e}.\n\
                 If the HF Hub is unreachable from this network, pre-download:\n  \
                 mkdir -p /tmp/potion-code-16M && \\\n  \
                 for f in config.json tokenizer.json model.safetensors; do \\\n    \
                   curl -sL -o \"/tmp/potion-code-16M/$f\" \\\n      \
                     \"https://huggingface.co/minishlab/potion-code-16M/resolve/main/$f\"; \\\n  \
                 done\n\
                 then re-run with: RIPVEC_SEMBLE_MODEL_PATH=/tmp/potion-code-16M cargo test \\\n  \
                     --test ripvec_port_parity -- --ignored"
            )
        })
        // `_guard` is released here, before the index build starts.
    };
    let cfg = SearchConfig {
        batch_size: 32,
        max_tokens: 512,
        chunk: ripvec_core::chunk::ChunkConfig {
            max_chunk_bytes: 4096,
            window_size: 2048,
            window_overlap: 512,
        },
        text_mode: false,
        cascade_dim: None,
        file_type: None,
        exclude_extensions: Vec::new(),
        include_extensions: Vec::new(),
        ignore_patterns: Vec::new(),
        scope: ripvec_core::embed::Scope::All,
        mode: SearchMode::Hybrid,
    };
    let profiler = Profiler::noop();
    // NOTE(review): the trailing `None, 0.0` arguments are opaque from
    // this file; confirm their meaning against `RipvecIndex::from_root`.
    RipvecIndex::from_root(root, encoder, &cfg, &profiler, None, 0.0)
        .expect("RipvecIndex build should succeed")
}

/// Hybrid natural-language query: "authentication" should put an auth
/// source file at rank 0.
///
/// Mirrors semble's behaviour where NL queries with strong content
/// alignment lead via the semantic side. The assertion accepts either
/// `auth.py` or `auth_service.rs` on top.
///
/// `tests/test_auth.py` is explicitly rejected. Its path contains the
/// substring `auth.py`, so a bare substring check would let the
/// penalised test file pass as if it were the production source.
#[test]
#[ignore = "requires model2vec download (~16 MB on first run)"]
fn parity_nl_query_authentication_finds_auth_py() {
    let tmp = tempfile::TempDir::new().unwrap();
    let root = build_test_corpus(&tmp);
    let index = load_index(&root);
    let results = index.search("authentication", 5, SearchMode::Hybrid, None, None, None);

    assert!(!results.is_empty(), "expected non-empty results");
    let chunks = index.chunks();
    let top_path = &chunks[results[0].0].file_path;
    // "test_auth.py" ends in "auth.py"; exclude it so only the
    // production file satisfies the first alternative.
    let is_auth_py = top_path.contains("auth.py") && !top_path.contains("test_auth.py");
    assert!(
        is_auth_py || top_path.contains("auth_service.rs"),
        "expected auth.py or auth_service.rs first; got {top_path}"
    );
}

/// Symbol lookup under hybrid mode: querying `AuthService` must put
/// `auth_service.rs` at rank 0.
///
/// The expected signals are a stem-match boost on the file name plus a
/// symbol-definition boost on the `pub struct AuthService` line.
#[test]
#[ignore = "requires model2vec download (~16 MB on first run)"]
fn parity_symbol_query_authservice_finds_definition() {
    let workdir = tempfile::TempDir::new().unwrap();
    let corpus_root = build_test_corpus(&workdir);
    let index = load_index(&corpus_root);

    let hits = index.search("AuthService", 5, SearchMode::Hybrid, None, None, None);
    assert!(!hits.is_empty(), "expected non-empty results");

    // Map the best hit's chunk index back to the file it came from.
    let best_chunk = hits[0].0;
    let chunks = index.chunks();
    let top_path = &chunks[best_chunk].file_path;
    let is_definition_file = top_path.contains("auth_service.rs");
    assert!(
        is_definition_file,
        "expected auth_service.rs first; got {top_path}"
    );
}

/// Stem-overlap lookup under hybrid mode: the query "parse json" must
/// put `parser.rs` at rank 0 (the query token `parse` is a prefix of
/// the file stem `parser`).
#[test]
#[ignore = "requires model2vec download (~16 MB on first run)"]
fn parity_stem_query_parser_finds_parser_rs() {
    let workdir = tempfile::TempDir::new().unwrap();
    let corpus_root = build_test_corpus(&workdir);
    let index = load_index(&corpus_root);

    let hits = index.search("parse json", 5, SearchMode::Hybrid, None, None, None);
    assert!(!hits.is_empty(), "expected non-empty results");

    // Map the best hit's chunk index back to the file it came from.
    let best_chunk = hits[0].0;
    let chunks = index.chunks();
    let top_path = &chunks[best_chunk].file_path;
    let is_parser_file = top_path.contains("parser.rs");
    assert!(is_parser_file, "expected parser.rs first; got {top_path}");
}

/// Test-file penalty: for the query `authenticate_user`, which occurs
/// in both `src/auth.py` and `tests/test_auth.py`, the test file must
/// never outrank the production source. The strong penalty (0.3x)
/// should keep tests below.
///
/// Outcomes, by which files appear in the top 5:
/// - both present: the source's first hit must precede the test's.
/// - only the test file: failure.
/// - only the source, or neither: pass (the test file did not outrank
///   the source).
///
/// The result set must be non-empty, as in the other parity tests;
/// otherwise the ordering check would pass vacuously.
#[test]
#[ignore = "requires model2vec download (~16 MB on first run)"]
fn parity_test_file_penalty_keeps_tests_below_source() {
    let tmp = tempfile::TempDir::new().unwrap();
    let root = build_test_corpus(&tmp);
    let index = load_index(&root);
    let results = index.search("authenticate_user", 5, SearchMode::Hybrid, None, None, None);
    assert!(!results.is_empty(), "expected non-empty results");

    let chunks = index.chunks();
    // Rank (0-based) of the first hit whose file path contains `needle`.
    let first_rank_of = |needle: &str| {
        results
            .iter()
            .position(|(idx, _)| chunks[*idx].file_path.contains(needle))
    };
    let src_rank = first_rank_of("src/auth.py");
    let test_rank = first_rank_of("tests/test_auth.py");

    match (src_rank, test_rank) {
        (Some(s), Some(t)) => assert!(
            s < t,
            "src/auth.py (rank {s}) should rank above tests/test_auth.py (rank {t})"
        ),
        (None, Some(_)) => panic!("test file present in results but source file absent"),
        (Some(_), None) | (None, None) => {}
    }
}

/// Semantic-only mode parity check: the query "verify token" must
/// return results, and the top hit must come from a file whose path
/// contains `auth`.
///
/// Semantic-only search is expected to bypass path penalties (per
/// semble's `penalise_paths = alpha < 1.0` convention), so any of the
/// auth files, including the test file, is acceptable at rank 0.
#[test]
#[ignore = "requires model2vec download (~16 MB on first run)"]
fn parity_semantic_only_mode_returns_dense_top_k() {
    let workdir = tempfile::TempDir::new().unwrap();
    let corpus_root = build_test_corpus(&workdir);
    let index = load_index(&corpus_root);

    let hits = index.search("verify token", 3, SearchMode::Semantic, None, None, None);
    let got_hits = !hits.is_empty();
    assert!(got_hits, "expected non-empty semantic-only results");

    // The query aligns semantically with the auth part of the corpus,
    // so an auth file should lead even without keyword signals.
    let best_chunk = hits[0].0;
    let chunks = index.chunks();
    let top_path = &chunks[best_chunk].file_path;
    let is_auth_related = top_path.contains("auth");
    assert!(
        is_auth_related,
        "expected an auth-related file first under semantic mode; got {top_path}"
    );
}

/// Keyword-only (BM25) mode: querying the file stem `auth_service`
/// must put `auth_service.rs` at rank 0 via the stem-doubled path
/// enrichment.
#[test]
#[ignore = "requires model2vec download (~16 MB on first run)"]
fn parity_keyword_only_mode_finds_path_via_enrichment() {
    let workdir = tempfile::TempDir::new().unwrap();
    let corpus_root = build_test_corpus(&workdir);
    let index = load_index(&corpus_root);

    // Querying with the file stem makes BM25's path enrichment the
    // dominant signal.
    let hits = index.search("auth_service", 5, SearchMode::Keyword, None, None, None);
    let got_hits = !hits.is_empty();
    assert!(got_hits, "expected non-empty BM25-only results");

    let best_chunk = hits[0].0;
    let chunks = index.chunks();
    let top_path = &chunks[best_chunk].file_path;
    let is_service_file = top_path.contains("auth_service.rs");
    assert!(
        is_service_file,
        "expected auth_service.rs first under keyword mode; got {top_path}"
    );
}