use std::fs;
use std::path::{Path, PathBuf};
use ripvec_core::embed::SearchConfig;
use ripvec_core::encoder::ripvec::dense::{DEFAULT_MODEL_REPO, StaticEncoder};
use ripvec_core::encoder::ripvec::index::RipvecIndex;
use ripvec_core::hybrid::SearchMode;
use ripvec_core::profile::Profiler;
/// Returns the model source string handed to the encoder loader.
///
/// Uses the value of the `RIPVEC_SEMBLE_MODEL_PATH` environment variable when
/// it can be read; otherwise falls back to `DEFAULT_MODEL_REPO`.
fn resolve_model_source() -> String {
    match std::env::var("RIPVEC_SEMBLE_MODEL_PATH") {
        Ok(overridden) => overridden,
        Err(_) => DEFAULT_MODEL_REPO.to_string(),
    }
}
/// Returns the process-wide mutex that this file locks around
/// `StaticEncoder::from_pretrained` calls.
///
/// The mutex is created on first use; every call returns the same instance.
fn download_lock() -> &'static std::sync::Mutex<()> {
    use std::sync::{Mutex, OnceLock};

    static DOWNLOAD_GUARD: OnceLock<Mutex<()>> = OnceLock::new();
    DOWNLOAD_GUARD.get_or_init(Mutex::default)
}
/// Writes the fixture corpus (`src/lib.rs`, `src/util.rs`, `README.md`) under `root`.
///
/// Missing parent directories are created first. Panics if a directory or
/// file cannot be written.
fn build_test_corpus(root: &Path) {
    const FIXTURES: [(&str, &str); 3] = [
        (
            "src/lib.rs",
            "pub fn one() -> u32 { 1 }\npub fn two() -> u32 { 2 }\n",
        ),
        ("src/util.rs", "pub fn helper(x: u32) -> u32 { x + 1 }\n"),
        (
            "README.md",
            "# Test corpus\nAn empty test project for reconcile tests.\n",
        ),
    ];

    FIXTURES.iter().for_each(|&(relative, body)| {
        let destination = root.join(relative);
        if let Some(dir) = destination.parent() {
            fs::create_dir_all(dir).unwrap();
        }
        fs::write(&destination, body).unwrap();
    });
}
/// Builds a `RipvecIndex` over `root` using the encoder named by
/// `resolve_model_source()` and a fixed hybrid-mode `SearchConfig`.
///
/// The encoder load is serialized through `download_lock()`.
///
/// # Panics
///
/// Panics with "encoder load" if the encoder cannot be loaded, or with
/// "RipvecIndex build" if the index cannot be built.
fn load_index(root: &Path) -> RipvecIndex {
    let source = resolve_model_source();
    let encoder = {
        // The mutex protects `()`, so a poisoned lock carries no broken state.
        // Recovering the guard keeps one failed encoder load from turning every
        // later test into a `PoisonError` panic that hides the original cause.
        let _guard = download_lock()
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        StaticEncoder::from_pretrained(&source).expect("encoder load")
        // `_guard` is released here, before the index build starts.
    };
    let cfg = SearchConfig {
        batch_size: 32,
        max_tokens: 512,
        chunk: ripvec_core::chunk::ChunkConfig {
            max_chunk_bytes: 4096,
            window_size: 2048,
            window_overlap: 512,
        },
        text_mode: false,
        cascade_dim: None,
        file_type: None,
        exclude_extensions: Vec::new(),
        include_extensions: Vec::new(),
        ignore_patterns: Vec::new(),
        scope: ripvec_core::embed::Scope::All,
        mode: SearchMode::Hybrid,
    };
    RipvecIndex::from_root(root, encoder, &cfg, &Profiler::noop(), None, 0.0)
        .expect("RipvecIndex build")
}
/// Finds the first manifest key whose path ends with `filename` and returns
/// an owned copy of it, or `None` when no key matches.
fn manifest_path_for(index: &RipvecIndex, filename: &str) -> Option<PathBuf> {
    for candidate in index.manifest().files.keys() {
        if candidate.ends_with(filename) {
            return Some(candidate.clone());
        }
    }
    None
}
/// A freshly built index must list all three corpus files in its manifest,
/// and every file that produced chunks must have a matching manifest key.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn manifest_populated_at_build_time() {
    let workspace = tempfile::TempDir::new().unwrap();
    build_test_corpus(workspace.path());
    let index = load_index(workspace.path());

    let manifest = index.manifest();
    assert_eq!(
        manifest.len(),
        3,
        "manifest should track all 3 corpus files; got {}",
        manifest.len()
    );

    // Deduplicate chunk paths so each source file is checked only once.
    let mut chunked_paths: std::collections::HashSet<&str> = std::collections::HashSet::new();
    for chunk in index.chunks().iter() {
        chunked_paths.insert(chunk.file_path.as_str());
    }

    for chunk_file in chunked_paths {
        let tracked = manifest
            .files
            .keys()
            .map(|key| key.to_string_lossy())
            .any(|key| key.ends_with(chunk_file));
        assert!(
            tracked,
            "chunk file {chunk_file:?} must also exist in manifest"
        );
    }
}
/// Diffing an index against the tree it was just built from must report
/// no dirty, new, or deleted files.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_empty_immediately_after_build() {
    let workspace = tempfile::TempDir::new().unwrap();
    let root = workspace.path();
    build_test_corpus(root);
    let index = load_index(root);

    let changes = index.diff_against_filesystem();
    let (dirty, added, removed) = (
        changes.dirty.len(),
        changes.new.len(),
        changes.deleted.len(),
    );
    assert!(
        changes.is_empty(),
        "fresh index against unchanged FS must yield empty diff; got dirty={} new={} deleted={}",
        dirty,
        added,
        removed
    );
}
/// A file created after the index was built must show up as the only entry
/// in `new`, with `dirty` and `deleted` left empty.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_detects_added_file() {
    let workspace = tempfile::TempDir::new().unwrap();
    let root = workspace.path();
    build_test_corpus(root);
    let index = load_index(root);

    // Introduce one file the index has never seen.
    fs::write(root.join("src/added.rs"), "pub fn fresh() {}\n").unwrap();

    let changes = index.diff_against_filesystem();
    assert!(
        changes.dirty.is_empty(),
        "no dirty expected; got {:?}",
        changes.dirty
    );
    assert!(
        changes.deleted.is_empty(),
        "no deleted expected; got {:?}",
        changes.deleted
    );
    assert_eq!(
        changes.new.len(),
        1,
        "added.rs must appear in new; got {:?}",
        changes.new
    );

    let added = &changes.new[0];
    assert!(
        added.ends_with("src/added.rs"),
        "new path {:?} must end with src/added.rs",
        added
    );
}
/// Removing an indexed file must show up as the only entry in `deleted`,
/// with `dirty` and `new` left empty.
///
/// Every assertion carries a message that prints the offending diff category,
/// so a failure shows what the diff actually contained.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_detects_deleted_file() {
    let tmp = tempfile::TempDir::new().unwrap();
    build_test_corpus(tmp.path());
    let index = load_index(tmp.path());

    // Use the path exactly as the manifest recorded it.
    let util = manifest_path_for(&index, "src/util.rs").expect("util.rs in manifest");
    fs::remove_file(&util).unwrap();

    let diff = index.diff_against_filesystem();
    assert!(
        diff.dirty.is_empty(),
        "no dirty expected; got {:?}",
        diff.dirty
    );
    assert!(diff.new.is_empty(), "no new expected; got {:?}", diff.new);
    assert_eq!(
        diff.deleted.len(),
        1,
        "util.rs must appear in deleted; got {:?}",
        diff.deleted
    );
    assert!(
        diff.deleted[0].ends_with("src/util.rs"),
        "deleted path {:?} must end with src/util.rs",
        diff.deleted[0]
    );
}
/// Rewriting an indexed file with different content must report exactly that
/// file as dirty, with nothing new or deleted.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_detects_real_content_change() {
    let workspace = tempfile::TempDir::new().unwrap();
    let root = workspace.path();
    build_test_corpus(root);
    let index = load_index(root);
    let util_path = manifest_path_for(&index, "src/util.rs").expect("util.rs in manifest");

    // NOTE(review): the short pause presumably lets the rewrite land with a
    // later mtime than the indexed one — confirm against the diff implementation.
    let pause = std::time::Duration::from_millis(20);
    std::thread::sleep(pause);
    fs::write(&util_path, "pub fn helper(x: u32) -> u32 { x * 2 }\n").unwrap();

    let diff = index.diff_against_filesystem();
    assert!(diff.new.is_empty(), "no new expected; got {:?}", diff.new);
    assert!(
        diff.deleted.is_empty(),
        "no deleted; got {:?}",
        diff.deleted
    );
    assert_eq!(diff.dirty.len(), 1, "util.rs edit must be dirty");
    assert!(diff.dirty[0].ends_with("src/util.rs"));
}
/// Rewriting an indexed file with byte-identical content must not be
/// reported as a change of any kind.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_ignores_touched_but_unchanged() {
    let workspace = tempfile::TempDir::new().unwrap();
    let root = workspace.path();
    build_test_corpus(root);
    let index = load_index(root);
    let util_path = manifest_path_for(&index, "src/util.rs").expect("util.rs in manifest");

    // Read the file back and write the very same bytes after a short pause.
    let unchanged = fs::read_to_string(&util_path).unwrap();
    let pause = std::time::Duration::from_millis(20);
    std::thread::sleep(pause);
    fs::write(&util_path, unchanged).unwrap();

    let changes = index.diff_against_filesystem();
    assert!(
        changes.is_empty(),
        "touch-with-same-content must yield empty diff; got dirty={:?} new={:?} deleted={:?}",
        changes.dirty,
        changes.new,
        changes.deleted
    );
}
/// One edit, one deletion, and one addition made together must each land in
/// their own diff category, for a total of three changes.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_handles_simultaneous_add_edit_delete() {
    let workspace = tempfile::TempDir::new().unwrap();
    let root = workspace.path();
    build_test_corpus(root);
    let index = load_index(root);
    let lib_path = manifest_path_for(&index, "src/lib.rs").expect("lib.rs in manifest");
    let util_path = manifest_path_for(&index, "src/util.rs").expect("util.rs in manifest");

    let pause = std::time::Duration::from_millis(20);
    std::thread::sleep(pause);

    // Edit lib.rs, delete util.rs, then add a brand-new file.
    fs::write(&lib_path, "pub fn renamed() -> u32 { 99 }\n").unwrap();
    fs::remove_file(&util_path).unwrap();
    fs::write(root.join("src/added.rs"), "pub fn novel() {}\n").unwrap();

    let diff = index.diff_against_filesystem();

    assert_eq!(diff.dirty.len(), 1, "expected 1 dirty (lib.rs)");
    assert!(diff.dirty[0].ends_with("src/lib.rs"));

    assert_eq!(diff.deleted.len(), 1, "expected 1 deleted (util.rs)");
    assert!(diff.deleted[0].ends_with("src/util.rs"));

    assert_eq!(diff.new.len(), 1, "expected 1 new (added.rs)");
    assert!(diff.new[0].ends_with("src/added.rs"));

    assert_eq!(diff.total(), 3);
}
/// With `json` listed in `exclude_extensions`, a `.json` file added after the
/// build must not appear in `diff.new`, while an added `.rs` file must.
///
/// The index is built inline rather than through `load_index` because this
/// test needs a non-empty `exclude_extensions`.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn diff_honors_walk_options_for_added_files() {
    let tmp = tempfile::TempDir::new().unwrap();
    build_test_corpus(tmp.path());
    let source = resolve_model_source();
    let encoder = {
        // The mutex protects `()`, so a poisoned lock carries no broken state.
        // Recovering the guard keeps a panic in another test's encoder load
        // from surfacing here as an unrelated `PoisonError`.
        let _guard = download_lock()
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        StaticEncoder::from_pretrained(&source).expect("encoder load")
        // `_guard` is released here, before the index build starts.
    };
    let cfg = SearchConfig {
        batch_size: 32,
        max_tokens: 512,
        chunk: ripvec_core::chunk::ChunkConfig {
            max_chunk_bytes: 4096,
            window_size: 2048,
            window_overlap: 512,
        },
        text_mode: false,
        cascade_dim: None,
        file_type: None,
        exclude_extensions: vec!["json".to_string()],
        include_extensions: Vec::new(),
        ignore_patterns: Vec::new(),
        scope: ripvec_core::embed::Scope::All,
        mode: SearchMode::Hybrid,
    };
    let index = RipvecIndex::from_root(tmp.path(), encoder, &cfg, &Profiler::noop(), None, 0.0)
        .expect("build");

    // One file with the excluded extension and one with an accepted extension.
    fs::write(tmp.path().join("data.json"), "{\"x\": 1}\n").unwrap();
    fs::write(tmp.path().join("src/included.rs"), "fn x() {}\n").unwrap();

    let diff = index.diff_against_filesystem();
    assert!(
        diff.new.iter().all(|p| !p.ends_with("data.json")),
        "excluded .json must not appear in diff.new: {:?}",
        diff.new
    );
    assert!(
        diff.new.iter().any(|p| p.ends_with("src/included.rs")),
        "included .rs must appear in diff.new: {:?}",
        diff.new
    );
}
/// Applying a diff that contains one new file must add chunks, grow the
/// manifest by one entry, expose the new content, and keep the embedding row
/// count equal to the chunk count.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn apply_diff_adds_new_file() {
    let workspace = tempfile::TempDir::new().unwrap();
    let root = workspace.path();
    build_test_corpus(root);
    let index = load_index(root);
    let chunks_before = index.chunks().len();
    let entries_before = index.manifest().len();

    let added_path = root.join("src/added.rs");
    fs::write(added_path, "pub fn newly_introduced() -> u32 { 42 }\n").unwrap();

    let diff = index.diff_against_filesystem();
    let updated = index
        .apply_diff(&diff, &Profiler::noop())
        .expect("apply_diff");

    let chunks_after = updated.chunks().len();
    assert!(
        chunks_after > chunks_before,
        "added file must produce additional chunks"
    );
    assert_eq!(
        updated.manifest().len(),
        entries_before + 1,
        "manifest must gain the new entry"
    );

    let mentions_new_fn = updated
        .chunks()
        .iter()
        .any(|chunk| chunk.content.contains("newly_introduced"));
    assert!(
        mentions_new_fn,
        "new file's chunk content must appear in updated index"
    );
    assert_eq!(
        updated.embeddings().nrows(),
        chunks_after,
        "embeddings row count must match chunks count"
    );
}
/// Applying a diff that contains one deleted file must remove that file's
/// chunks, shrink the manifest by one entry, and keep the embedding row count
/// equal to the chunk count.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn apply_diff_drops_deleted_file() {
    let workspace = tempfile::TempDir::new().unwrap();
    let root = workspace.path();
    build_test_corpus(root);
    let index = load_index(root);
    let chunks_before = index.chunks().len();
    let entries_before = index.manifest().len();

    let util_path = manifest_path_for(&index, "src/util.rs").expect("util.rs in manifest");
    fs::remove_file(&util_path).unwrap();

    let diff = index.diff_against_filesystem();
    let updated = index
        .apply_diff(&diff, &Profiler::noop())
        .expect("apply_diff");

    let chunks_after = updated.chunks().len();
    assert!(
        chunks_after < chunks_before,
        "deleted file must remove chunks"
    );
    assert_eq!(
        updated.manifest().len(),
        entries_before - 1,
        "manifest must lose the deleted entry"
    );

    let util_gone = updated
        .chunks()
        .iter()
        .all(|chunk| !chunk.file_path.ends_with("util.rs"));
    assert!(util_gone, "no chunks from the deleted file should remain");
    assert_eq!(
        updated.embeddings().nrows(),
        chunks_after,
        "embeddings row count must match chunks count after delete"
    );
}
/// Applying a diff for an edited file must replace that file's chunks with
/// the new content, drop the old content, leave other files' chunks in place,
/// and keep the embedding row count equal to the chunk count.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn apply_diff_replaces_dirty_file_content() {
    let workspace = tempfile::TempDir::new().unwrap();
    let root = workspace.path();
    build_test_corpus(root);
    let index = load_index(root);
    let util_path = manifest_path_for(&index, "src/util.rs").expect("util.rs in manifest");

    let pause = std::time::Duration::from_millis(20);
    std::thread::sleep(pause);
    let rewritten = "pub fn brand_new_function_name(x: u32) -> u32 { x * 99 }\n";
    fs::write(&util_path, rewritten).unwrap();

    let diff = index.diff_against_filesystem();
    assert_eq!(diff.dirty.len(), 1, "test setup: one dirty file");
    let updated = index
        .apply_diff(&diff, &Profiler::noop())
        .expect("apply_diff");

    // Gather the text of every chunk that still belongs to util.rs.
    let util_chunks: Vec<&ripvec_core::chunk::CodeChunk> = updated
        .chunks()
        .iter()
        .filter(|chunk| chunk.file_path.ends_with("util.rs"))
        .collect();
    let util_texts: Vec<&str> = util_chunks
        .iter()
        .map(|chunk| chunk.content.as_str())
        .collect();

    assert!(
        !util_texts.is_empty(),
        "util.rs must still have chunks after dirty rewrite"
    );
    assert!(
        util_texts
            .iter()
            .any(|text| text.contains("brand_new_function_name")),
        "new identifier must appear in util.rs chunks: {:?}",
        util_texts
    );
    assert!(
        util_texts.iter().all(|text| !text.contains("helper")),
        "old identifier must NOT appear in util.rs chunks"
    );

    let lib_survives = updated
        .chunks()
        .iter()
        .any(|chunk| chunk.file_path.ends_with("lib.rs"));
    assert!(
        lib_survives,
        "untouched lib.rs chunks must survive apply_diff"
    );
    assert_eq!(
        updated.embeddings().nrows(),
        updated.chunks().len(),
        "embeddings row count must match chunks count after dirty rewrite"
    );
}
/// Applying a diff that mixes one edit, one deletion, and one addition must
/// reflect all three in the updated index while keeping the manifest size
/// unchanged and the embedding row count equal to the chunk count.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn apply_diff_handles_multi_category_diff() {
    let workspace = tempfile::TempDir::new().unwrap();
    let root = workspace.path();
    build_test_corpus(root);
    let index = load_index(root);
    let lib_file = manifest_path_for(&index, "src/lib.rs").expect("lib.rs");
    let util_file = manifest_path_for(&index, "src/util.rs").expect("util.rs");

    let pause = std::time::Duration::from_millis(20);
    std::thread::sleep(pause);

    // Edit lib.rs, delete util.rs, then add a brand-new file.
    fs::write(&lib_file, "pub fn renamed_one() -> u32 { 1 }\n").unwrap();
    fs::remove_file(&util_file).unwrap();
    let added_file = root.join("src/added.rs");
    fs::write(added_file, "pub fn novel_function() {}\n").unwrap();

    let diff = index.diff_against_filesystem();
    assert_eq!(diff.total(), 3, "test setup: three changes");
    let updated = index
        .apply_diff(&diff, &Profiler::noop())
        .expect("apply_diff");

    let has_renamed = updated
        .chunks()
        .iter()
        .any(|chunk| chunk.content.contains("renamed_one"));
    assert!(has_renamed, "dirty file's new content must appear");

    let util_gone = updated
        .chunks()
        .iter()
        .all(|chunk| !chunk.file_path.ends_with("util.rs"));
    assert!(util_gone, "deleted file's chunks must be gone");

    let has_novel = updated
        .chunks()
        .iter()
        .any(|chunk| chunk.content.contains("novel_function"));
    assert!(has_novel, "new file's content must appear");

    assert_eq!(
        updated.embeddings().nrows(),
        updated.chunks().len(),
        "embeddings row count must match chunks count"
    );
    assert_eq!(
        updated.manifest().len(),
        index.manifest().len(),
        "net manifest size: -1 deleted + 1 added = 0 delta"
    );
}
/// After applying a diff that adds a file with a distinctive identifier, a
/// keyword-mode search for that identifier must return results, and the top
/// result must point at a chunk from the new file.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn apply_diff_produces_searchable_index() {
    let workspace = tempfile::TempDir::new().unwrap();
    let root = workspace.path();
    build_test_corpus(root);
    let index = load_index(root);

    let distinctive_path = root.join("src/distinctive.rs");
    fs::write(
        distinctive_path,
        "pub fn xylophone_unique_marker() -> u32 { 7 }\n",
    )
    .unwrap();

    let diff = index.diff_against_filesystem();
    let updated = index
        .apply_diff(&diff, &Profiler::noop())
        .expect("apply_diff");

    let hits = updated.search(
        "xylophone_unique_marker",
        5,
        SearchMode::Keyword,
        None,
        None,
        None,
    );
    assert!(
        !hits.is_empty(),
        "BM25 keyword search must find the newly-added identifier"
    );

    // The first field of a hit is used as an index into the chunk list.
    let best_chunk_idx = hits[0].0;
    let all_chunks = updated.chunks();
    let best = &all_chunks[best_chunk_idx];
    assert!(
        best.file_path.ends_with("distinctive.rs"),
        "top hit for distinctive identifier must be the new file; got {:?}",
        best.file_path
    );
}
/// Applying an empty diff must yield an index with the same chunk count and
/// the same manifest size as the original.
#[test]
#[ignore = "requires Model2Vec download (~32 MB on first run)"]
fn apply_diff_with_empty_diff_is_noop() {
    let workspace = tempfile::TempDir::new().unwrap();
    let root = workspace.path();
    build_test_corpus(root);
    let index = load_index(root);
    let (chunks_before, entries_before) = (index.chunks().len(), index.manifest().len());

    let no_changes = index.diff_against_filesystem();
    assert!(no_changes.is_empty(), "fresh index must produce empty diff");

    let rebuilt = index
        .apply_diff(&no_changes, &Profiler::noop())
        .expect("apply_diff on empty diff");
    assert_eq!(rebuilt.chunks().len(), chunks_before);
    assert_eq!(rebuilt.manifest().len(), entries_before);
}