ripvec-core 3.1.0

Semantic code + document search engine. Cacheless static-embedding + cross-encoder rerank by default; optional ModernBERT/BGE transformer engines with GPU backends. Tree-sitter chunking, hybrid BM25 + PageRank, composable ranking layers.
Documentation
//! End-to-end tests for the online reconcile path (v3.1.0+).
//!
//! Builds a real [`RipvecIndex`] over a temporary corpus, then mutates
//! the filesystem and asserts that [`RipvecIndex::diff_against_filesystem`]
//! categorizes the changes correctly. Mirrors the manifest unit tests
//! in `crates/ripvec-core/src/encoder/ripvec/manifest.rs::tests`, but
//! exercises the full integration: walk options captured from
//! `SearchConfig`, real `embed_root`, manifest populated alongside the
//! chunk/embedding build.
//!
//! Gated `#[ignore]` because the tests need the Model2Vec encoder, which
//! is downloaded (~32 MB) on first execution. Run with
//! `cargo test --test reconcile -- --ignored` once the model is cached.
//! The same `RIPVEC_SEMBLE_MODEL_PATH` override used by
//! `ripvec_port_parity` works here for offline runs.

use std::fs;
use std::path::{Path, PathBuf};

use ripvec_core::embed::SearchConfig;
use ripvec_core::encoder::ripvec::dense::{DEFAULT_MODEL_REPO, StaticEncoder};
use ripvec_core::encoder::ripvec::index::RipvecIndex;
use ripvec_core::hybrid::SearchMode;
use ripvec_core::profile::Profiler;

/// Chooses where the encoder is loaded from.
///
/// Returns the value of the `RIPVEC_SEMBLE_MODEL_PATH` environment variable
/// when it can be read, and [`DEFAULT_MODEL_REPO`] otherwise (variable unset
/// or not valid Unicode).
fn resolve_model_source() -> String {
    match std::env::var("RIPVEC_SEMBLE_MODEL_PATH") {
        Ok(override_path) => override_path,
        Err(_) => DEFAULT_MODEL_REPO.to_string(),
    }
}

/// Returns the single process-wide mutex shared by all tests in this file.
///
/// Every call hands back a reference to the same `Mutex<()>`; the callers in
/// this file hold it while the encoder is being loaded.
fn download_lock() -> &'static std::sync::Mutex<()> {
    // `Mutex::new` is a `const fn`, so the lock can live directly in a static.
    static DOWNLOAD_GUARD: std::sync::Mutex<()> = std::sync::Mutex::new(());
    &DOWNLOAD_GUARD
}

/// Writes the three-file fixture corpus (`src/lib.rs`, `src/util.rs`,
/// `README.md`) beneath `root`, creating parent directories as needed.
///
/// # Panics
///
/// Panics if a directory or file cannot be created.
fn build_test_corpus(root: &Path) {
    // Materializes one fixture file, making sure its directory exists first.
    let put = |relative: &str, body: &str| {
        let target = root.join(relative);
        if let Some(dir) = target.parent() {
            fs::create_dir_all(dir).unwrap();
        }
        fs::write(&target, body).unwrap();
    };

    put(
        "src/lib.rs",
        "pub fn one() -> u32 { 1 }\npub fn two() -> u32 { 2 }\n",
    );
    put("src/util.rs", "pub fn helper(x: u32) -> u32 { x + 1 }\n");
    put(
        "README.md",
        "# Test corpus\nAn empty test project for reconcile tests.\n",
    );
}

/// Builds a [`RipvecIndex`] over `root` using the fixed configuration shared
/// by most tests in this file (hybrid mode, no extension filters, no ignore
/// patterns).
///
/// The encoder source comes from [`resolve_model_source`], and the encoder is
/// loaded while holding [`download_lock`] so concurrent tests load it one at
/// a time.
///
/// # Panics
///
/// Panics if the encoder cannot be loaded or the index cannot be built.
fn load_index(root: &Path) -> RipvecIndex {
    let source = resolve_model_source();
    let encoder = {
        // If another test panicked while holding the lock, the mutex is
        // poisoned. It guards no data (`()`), so take the guard anyway;
        // otherwise one failed load would surface as `PoisonError` panics in
        // every other test and hide the real error.
        let _guard = download_lock()
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        StaticEncoder::from_pretrained(&source).expect("encoder load")
        // `_guard` is released here, before the index build starts.
    };
    let cfg = SearchConfig {
        batch_size: 32,
        max_tokens: 512,
        chunk: ripvec_core::chunk::ChunkConfig {
            max_chunk_bytes: 4096,
            window_size: 2048,
            window_overlap: 512,
        },
        text_mode: false,
        cascade_dim: None,
        file_type: None,
        exclude_extensions: Vec::new(),
        include_extensions: Vec::new(),
        ignore_patterns: Vec::new(),
        scope: ripvec_core::embed::Scope::All,
        mode: SearchMode::Hybrid,
    };
    RipvecIndex::from_root(root, encoder, &cfg, &Profiler::noop(), None, 0.0)
        .expect("RipvecIndex build")
}

/// Looks up a manifest entry by the tail of its path.
///
/// Returns a clone of the first manifest key for which `ends_with(filename)`
/// holds, or `None` when no key matches. Matching on the suffix avoids
/// depending on how the tmpdir path was canonicalized when the manifest
/// stored its absolute paths.
fn manifest_path_for(index: &RipvecIndex, filename: &str) -> Option<PathBuf> {
    index
        .manifest()
        .files
        .keys()
        .find_map(|candidate| candidate.ends_with(filename).then(|| candidate.clone()))
}

/// The initial build must leave one manifest entry per corpus file, and
/// every file referenced by a chunk must be findable in the manifest.
///
/// The cross-check runs in one direction only (chunks → manifest) and
/// compares by string suffix.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn manifest_populated_at_build_time() {
    let corpus_dir = tempfile::TempDir::new().unwrap();
    let root = corpus_dir.path();
    build_test_corpus(root);
    let index = load_index(root);

    let manifest = index.manifest();
    let tracked = manifest.len();
    assert_eq!(
        tracked, 3,
        "manifest should track all 3 corpus files; got {}",
        tracked
    );

    // Collapse the chunk list to the distinct files it references.
    let mut chunk_files: std::collections::HashSet<&str> = std::collections::HashSet::new();
    for chunk in index.chunks().iter() {
        chunk_files.insert(chunk.file_path.as_str());
    }

    // Each of those files has to show up among the manifest keys.
    for chunk_file in chunk_files {
        let exists_in_manifest = manifest
            .files
            .keys()
            .map(|tracked_path| tracked_path.to_string_lossy())
            .any(|tracked_path| tracked_path.ends_with(chunk_file));
        assert!(
            exists_in_manifest,
            "chunk file {chunk_file:?} must also exist in manifest"
        );
    }
}

/// No-work path: diffing a just-built index against a filesystem nobody
/// has touched must report nothing.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_empty_immediately_after_build() {
    let corpus_dir = tempfile::TempDir::new().unwrap();
    let root = corpus_dir.path();
    build_test_corpus(root);
    let index = load_index(root);

    let diff = index.diff_against_filesystem();
    let (dirty, added, removed) = (diff.dirty.len(), diff.new.len(), diff.deleted.len());
    assert!(
        diff.is_empty(),
        "fresh index against unchanged FS must yield empty diff; got dirty={} new={} deleted={}",
        dirty,
        added,
        removed
    );
}

/// A file created after the index was built must be reported under `new`,
/// and only there.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_detects_added_file() {
    let corpus_dir = tempfile::TempDir::new().unwrap();
    let root = corpus_dir.path();
    build_test_corpus(root);
    let index = load_index(root);

    // Written after the build, so the manifest has no entry for it.
    fs::write(root.join("src/added.rs"), "pub fn fresh() {}\n").unwrap();

    let diff = index.diff_against_filesystem();
    assert!(
        diff.dirty.is_empty(),
        "no dirty expected; got {:?}",
        diff.dirty
    );
    assert!(
        diff.deleted.is_empty(),
        "no deleted expected; got {:?}",
        diff.deleted
    );
    assert_eq!(
        diff.new.len(),
        1,
        "added.rs must appear in new; got {:?}",
        diff.new
    );

    // Safe to index: the length was asserted just above.
    let added = &diff.new[0];
    assert!(
        added.ends_with("src/added.rs"),
        "new path {:?} must end with src/added.rs",
        added
    );
}

/// Removing a file from the corpus after the index is built must show
/// up as `deleted`, and nowhere else in the diff.
///
/// The file is removed via the path stored in the manifest, so the test does
/// not depend on how the tmpdir path was canonicalized.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_detects_deleted_file() {
    let tmp = tempfile::TempDir::new().unwrap();
    build_test_corpus(tmp.path());
    let index = load_index(tmp.path());

    let util = manifest_path_for(&index, "src/util.rs").expect("util.rs in manifest");
    fs::remove_file(&util).unwrap();

    // Every assertion carries the offending paths, matching the sibling
    // tests, so a failure can be diagnosed from the test output alone.
    let diff = index.diff_against_filesystem();
    assert!(
        diff.dirty.is_empty(),
        "no dirty expected; got {:?}",
        diff.dirty
    );
    assert!(diff.new.is_empty(), "no new expected; got {:?}", diff.new);
    assert_eq!(
        diff.deleted.len(),
        1,
        "util.rs must appear in deleted; got {:?}",
        diff.deleted
    );
    assert!(
        diff.deleted[0].ends_with("src/util.rs"),
        "deleted path {:?} must end with src/util.rs",
        diff.deleted[0]
    );
}

/// Overwriting an indexed file with different content (a genuine edit, not
/// a touch) must be reported under `dirty`, and only there.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_detects_real_content_change() {
    let corpus_dir = tempfile::TempDir::new().unwrap();
    let root = corpus_dir.path();
    build_test_corpus(root);
    let index = load_index(root);

    let edited = manifest_path_for(&index, "src/util.rs").expect("util.rs in manifest");
    // NOTE(review): the replacement below has the same byte length as the
    // original line, so a size check alone cannot tell them apart.
    // Presumably the pause exists so the rewrite gets a later mtime —
    // confirm against what the manifest actually compares.
    let pause = std::time::Duration::from_millis(20);
    std::thread::sleep(pause);
    fs::write(&edited, "pub fn helper(x: u32) -> u32 { x * 2 }\n").unwrap();

    let diff = index.diff_against_filesystem();
    assert!(diff.new.is_empty(), "no new expected; got {:?}", diff.new);
    assert!(
        diff.deleted.is_empty(),
        "no deleted; got {:?}",
        diff.deleted
    );
    assert_eq!(diff.dirty.len(), 1, "util.rs edit must be dirty");

    // Safe to index: the length was asserted just above.
    let changed = &diff.dirty[0];
    assert!(changed.ends_with("src/util.rs"));
}

/// A file rewritten with byte-identical content (vim save-no-edit, a
/// formatter producing hash-equal output) must NOT show up in the diff:
/// blake3 verification overrides the stat-tuple mismatch.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_ignores_touched_but_unchanged() {
    let corpus_dir = tempfile::TempDir::new().unwrap();
    let root = corpus_dir.path();
    build_test_corpus(root);
    let index = load_index(root);

    let touched = manifest_path_for(&index, "src/util.rs").expect("util.rs in manifest");
    let same_bytes = fs::read_to_string(&touched).unwrap();
    let pause = std::time::Duration::from_millis(20);
    std::thread::sleep(pause);
    // Writing the very same content back: only the file's metadata moves.
    fs::write(&touched, same_bytes).unwrap();

    let diff = index.diff_against_filesystem();
    let nothing_changed = diff.is_empty();
    assert!(
        nothing_changed,
        "touch-with-same-content must yield empty diff; got dirty={:?} new={:?} deleted={:?}",
        diff.dirty,
        diff.new,
        diff.deleted
    );
}

/// One edit, one deletion and one addition made between build and diff
/// must each land in its own category, for three changes in total.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_handles_simultaneous_add_edit_delete() {
    let corpus_dir = tempfile::TempDir::new().unwrap();
    let root = corpus_dir.path();
    build_test_corpus(root);
    let index = load_index(root);

    let to_edit = manifest_path_for(&index, "src/lib.rs").expect("lib.rs in manifest");
    let to_delete = manifest_path_for(&index, "src/util.rs").expect("util.rs in manifest");
    let to_add = root.join("src/added.rs");

    std::thread::sleep(std::time::Duration::from_millis(20));
    // Edit lib.rs, delete util.rs, add added.rs — in that order.
    fs::write(&to_edit, "pub fn renamed() -> u32 { 99 }\n").unwrap();
    fs::remove_file(&to_delete).unwrap();
    fs::write(to_add, "pub fn novel() {}\n").unwrap();

    let diff = index.diff_against_filesystem();
    let (dirty, deleted, new) = (&diff.dirty, &diff.deleted, &diff.new);

    assert_eq!(dirty.len(), 1, "expected 1 dirty (lib.rs)");
    assert!(dirty[0].ends_with("src/lib.rs"));

    assert_eq!(deleted.len(), 1, "expected 1 deleted (util.rs)");
    assert!(deleted[0].ends_with("src/util.rs"));

    assert_eq!(new.len(), 1, "expected 1 new (added.rs)");
    assert!(new[0].ends_with("src/added.rs"));

    assert_eq!(diff.total(), 3);
}

/// Walk options captured at build time must be honored on reconcile —
/// excluded files don't appear as `new` even if they're added during
/// the test.
///
/// The index is built here rather than through `load_index` because this
/// test needs a non-empty `exclude_extensions`.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_honors_walk_options_for_added_files() {
    let tmp = tempfile::TempDir::new().unwrap();
    build_test_corpus(tmp.path());

    // Build with .json excluded
    let source = resolve_model_source();
    let encoder = {
        // If another test panicked while holding the lock, the mutex is
        // poisoned. It guards no data (`()`), so take the guard anyway;
        // otherwise this test would fail with `PoisonError` instead of
        // reporting its own outcome.
        let _guard = download_lock()
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        StaticEncoder::from_pretrained(&source).expect("encoder load")
        // `_guard` is released here, before the index build starts.
    };
    let cfg = SearchConfig {
        batch_size: 32,
        max_tokens: 512,
        chunk: ripvec_core::chunk::ChunkConfig {
            max_chunk_bytes: 4096,
            window_size: 2048,
            window_overlap: 512,
        },
        text_mode: false,
        cascade_dim: None,
        file_type: None,
        exclude_extensions: vec!["json".to_string()],
        include_extensions: Vec::new(),
        ignore_patterns: Vec::new(),
        scope: ripvec_core::embed::Scope::All,
        mode: SearchMode::Hybrid,
    };
    let index = RipvecIndex::from_root(tmp.path(), encoder, &cfg, &Profiler::noop(), None, 0.0)
        .expect("build");

    // Add a .json file — should be filtered by the captured walk options
    fs::write(tmp.path().join("data.json"), "{\"x\": 1}\n").unwrap();
    // Also add a .rs file — should be detected
    fs::write(tmp.path().join("src/included.rs"), "fn x() {}\n").unwrap();

    let diff = index.diff_against_filesystem();
    assert!(
        diff.new.iter().all(|p| !p.ends_with("data.json")),
        "excluded .json must not appear in diff.new: {:?}",
        diff.new
    );
    assert!(
        diff.new.iter().any(|p| p.ends_with("src/included.rs")),
        "included .rs must appear in diff.new: {:?}",
        diff.new
    );
}