use std::fs;
use std::path::PathBuf;
use ripvec_core::embed::SearchConfig;
use ripvec_core::encoder::ripvec::dense::{DEFAULT_MODEL_REPO, StaticEncoder};
use ripvec_core::encoder::ripvec::index::RipvecIndex;
use ripvec_core::hybrid::SearchMode;
use ripvec_core::profile::Profiler;
/// Writes a small fixed corpus of source files under the temp directory and
/// returns the directory's path.
///
/// The corpus has three auth-related files (two under `src/`, one under
/// `tests/`), one JSON parser file, and one unrelated utility file. Parent
/// directories are created as needed.
///
/// # Panics
///
/// Panics with `"mkdir failed"` or `"write failed"` if the filesystem
/// operations fail.
fn build_test_corpus(tmp: &tempfile::TempDir) -> PathBuf {
    // (path relative to the corpus root, file contents)
    const FIXTURES: &[(&str, &str)] = &[
        (
            "src/auth.py",
            "def authenticate_user(token: str) -> bool:\n return verify_token(token)\n\n\
             def verify_token(token: str) -> bool:\n return token.startswith('valid')\n",
        ),
        (
            "src/auth_service.rs",
            "pub struct AuthService { secret: String }\n\
             impl AuthService {\n pub fn verify(&self, token: &str) -> bool { token == self.secret }\n}\n",
        ),
        (
            "src/parser.rs",
            "pub fn parse_json(input: &str) -> Result<Value, Error> {\n serde_json::from_str(input)\n}\n",
        ),
        (
            "src/utils.rs",
            "pub fn unrelated_utility() {\n println!(\"hello\")\n}\n",
        ),
        (
            "tests/test_auth.py",
            "def test_authenticate_user():\n assert authenticate_user('valid_token')\n",
        ),
    ];

    let corpus_root = tmp.path();
    for &(relative, body) in FIXTURES {
        let target = corpus_root.join(relative);
        // Every fixture lives in a subdirectory, so make sure it exists first.
        if let Some(dir) = target.parent() {
            fs::create_dir_all(dir).expect("mkdir failed");
        }
        fs::write(&target, body).expect("write failed");
    }
    PathBuf::from(corpus_root)
}
/// Picks the model source string handed to the encoder loader.
///
/// Returns the value of the `RIPVEC_SEMBLE_MODEL_PATH` environment variable
/// when it can be read; otherwise falls back to `DEFAULT_MODEL_REPO`. Any
/// error from `std::env::var` (unset or not valid Unicode) takes the fallback.
fn resolve_model_source() -> String {
    match std::env::var("RIPVEC_SEMBLE_MODEL_PATH") {
        Ok(overridden) => overridden,
        Err(_) => DEFAULT_MODEL_REPO.to_string(),
    }
}
/// Returns the single process-wide mutex shared by every caller.
///
/// `Mutex::new` is a `const fn`, so the lock lives in a plain `static` and
/// needs no lazy initialisation. Every call returns a reference to the same
/// mutex.
fn download_lock() -> &'static std::sync::Mutex<()> {
    static LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());
    &LOCK
}
/// Loads the static encoder and builds a `RipvecIndex` over `root`.
///
/// The model source comes from `resolve_model_source()`. The call to
/// `StaticEncoder::from_pretrained` runs while holding `download_lock()`, so
/// concurrent tests load the model one at a time (presumably to avoid racing
/// on a first-time download — confirm against the encoder implementation).
/// The lock is released before the index is built.
///
/// # Panics
///
/// * If the encoder cannot be loaded; the message includes manual
///   pre-download instructions.
/// * If `RipvecIndex::from_root` returns an error.
fn load_index(root: &std::path::Path) -> RipvecIndex {
    let source = resolve_model_source();
    let encoder = {
        // The mutex protects no data (`()`), so a poisoned lock carries no
        // broken invariant. Recover from poisoning rather than panicking:
        // otherwise one failed model load makes every later test die with a
        // "poisoned" message instead of the actionable one below.
        let _guard = download_lock()
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        StaticEncoder::from_pretrained(&source).unwrap_or_else(|e| {
            panic!(
                "model2vec load failed for source {source:?}: {e}.\n\
                 If the HF Hub is unreachable from this network, pre-download:\n \
                 mkdir -p /tmp/potion-code-16M && \\\n \
                 for f in config.json tokenizer.json model.safetensors; do \\\n \
                 curl -sL -o \"/tmp/potion-code-16M/$f\" \\\n \
                 \"https://huggingface.co/minishlab/potion-code-16M/resolve/main/$f\"; \\\n \
                 done\n\
                 then re-run with: RIPVEC_SEMBLE_MODEL_PATH=/tmp/potion-code-16M cargo test \\\n \
                 --test ripvec_port_parity -- --ignored"
            )
        })
        // `_guard` drops here, releasing the lock before indexing starts.
    };
    let cfg = SearchConfig {
        batch_size: 32,
        max_tokens: 512,
        chunk: ripvec_core::chunk::ChunkConfig {
            max_chunk_bytes: 4096,
            window_size: 2048,
            window_overlap: 512,
        },
        text_mode: false,
        cascade_dim: None,
        file_type: None,
        exclude_extensions: Vec::new(),
        include_extensions: Vec::new(),
        ignore_patterns: Vec::new(),
        scope: ripvec_core::embed::Scope::All,
        mode: SearchMode::Hybrid,
    };
    let profiler = Profiler::noop();
    // The trailing `None` and `0.0` are passed through unchanged; see
    // `RipvecIndex::from_root` for what they mean.
    RipvecIndex::from_root(root, encoder, &cfg, &profiler, None, 0.0)
        .expect("RipvecIndex build should succeed")
}
/// A hybrid search for the natural-language query `"authentication"` must
/// return at least one hit, and the top hit's file path must contain
/// `auth.py` or `auth_service.rs`.
#[test]
#[ignore = "requires model2vec download (~16 MB on first run)"]
fn parity_nl_query_authentication_finds_auth_py() {
    let workdir = tempfile::TempDir::new().unwrap();
    let corpus = build_test_corpus(&workdir);
    let index = load_index(&corpus);

    let hits = index.search("authentication", 5, SearchMode::Hybrid, None, None, None);
    assert!(!hits.is_empty(), "expected non-empty results");

    // Each hit is a pair whose first element indexes into `index.chunks()`.
    let all_chunks = index.chunks();
    let (best, _) = &hits[0];
    let top_path = &all_chunks[*best].file_path;

    // NOTE(review): this is a substring match, so "auth.py" is also satisfied
    // by "tests/test_auth.py" — confirm whether that leniency is intended.
    let accepted = ["auth.py", "auth_service.rs"];
    assert!(
        accepted.iter().any(|needle| top_path.contains(*needle)),
        "expected auth.py or auth_service.rs first; got {top_path}"
    );
}
/// A hybrid search for the symbol `"AuthService"` must return at least one
/// hit, and the top hit's file path must contain `auth_service.rs`.
#[test]
#[ignore = "requires model2vec download (~16 MB on first run)"]
fn parity_symbol_query_authservice_finds_definition() {
    let workdir = tempfile::TempDir::new().unwrap();
    let corpus = build_test_corpus(&workdir);
    let index = load_index(&corpus);

    let hits = index.search("AuthService", 5, SearchMode::Hybrid, None, None, None);
    assert!(!hits.is_empty(), "expected non-empty results");

    // First tuple element of a hit is the position of its chunk.
    let all_chunks = index.chunks();
    let (best, _) = &hits[0];
    let top_path = &all_chunks[*best].file_path;

    let is_definition_file = top_path.contains("auth_service.rs");
    assert!(
        is_definition_file,
        "expected auth_service.rs first; got {top_path}"
    );
}
/// A hybrid search for `"parse json"` must return at least one hit, and the
/// top hit's file path must contain `parser.rs`.
#[test]
#[ignore = "requires model2vec download (~16 MB on first run)"]
fn parity_stem_query_parser_finds_parser_rs() {
    let workdir = tempfile::TempDir::new().unwrap();
    let corpus = build_test_corpus(&workdir);
    let index = load_index(&corpus);

    let hits = index.search("parse json", 5, SearchMode::Hybrid, None, None, None);
    assert!(!hits.is_empty(), "expected non-empty results");

    // First tuple element of a hit is the position of its chunk.
    let all_chunks = index.chunks();
    let (best, _) = &hits[0];
    let top_path = &all_chunks[*best].file_path;

    let is_parser_file = top_path.contains("parser.rs");
    assert!(is_parser_file, "expected parser.rs first; got {top_path}");
}
/// For the hybrid query `"authenticate_user"`, a chunk from `src/auth.py`
/// must appear earlier in the results than any chunk from
/// `tests/test_auth.py`.
///
/// Outcomes:
/// * both files present: the source file's first rank must be strictly
///   smaller than the test file's first rank;
/// * only the test file present: failure;
/// * only the source file present: pass;
/// * neither present, but results non-empty: pass.
///
/// Results must be non-empty, so the test cannot pass vacuously when the
/// search returns nothing.
#[test]
#[ignore = "requires model2vec download (~16 MB on first run)"]
fn parity_test_file_penalty_keeps_tests_below_source() {
    let tmp = tempfile::TempDir::new().unwrap();
    let root = build_test_corpus(&tmp);
    let index = load_index(&root);
    let results = index.search("authenticate_user", 5, SearchMode::Hybrid, None, None, None);
    // Same precondition the sibling parity tests assert: an empty result
    // list would otherwise skip every check below.
    assert!(!results.is_empty(), "expected non-empty results");
    let chunks = index.chunks();

    // Rank (0-based position in `results`) of the first hit whose chunk path
    // contains `needle`, if any.
    let first_rank_of = |needle: &str| {
        results
            .iter()
            .position(|(idx, _)| chunks[*idx].file_path.contains(needle))
    };
    let src_rank = first_rank_of("src/auth.py");
    let test_rank = first_rank_of("tests/test_auth.py");

    match (src_rank, test_rank) {
        (Some(s), Some(t)) => assert!(
            s < t,
            "src/auth.py (rank {s}) should rank above tests/test_auth.py (rank {t})"
        ),
        (None, Some(_)) => panic!("test file present in results but source file absent"),
        // Source only, or neither file in the top results: nothing to compare.
        (Some(_), None) | (None, None) => {}
    }
}
/// A `SearchMode::Semantic` search for `"verify token"` (top 3) must return
/// at least one hit, and the top hit's file path must contain `auth`.
#[test]
#[ignore = "requires model2vec download (~16 MB on first run)"]
fn parity_semantic_only_mode_returns_dense_top_k() {
    let workdir = tempfile::TempDir::new().unwrap();
    let corpus = build_test_corpus(&workdir);
    let index = load_index(&corpus);

    let hits = index.search("verify token", 3, SearchMode::Semantic, None, None, None);
    assert!(
        !hits.is_empty(),
        "expected non-empty semantic-only results"
    );

    // First tuple element of a hit is the position of its chunk.
    let all_chunks = index.chunks();
    let (best, _) = &hits[0];
    let top_path = &all_chunks[*best].file_path;

    // Substring match: any of the three auth-named fixture files satisfies it.
    let is_auth_related = top_path.contains("auth");
    assert!(
        is_auth_related,
        "expected an auth-related file first under semantic mode; got {top_path}"
    );
}
/// A `SearchMode::Keyword` search for `"auth_service"` must return at least
/// one hit, and the top hit's file path must contain `auth_service.rs`.
///
/// The query string appears in the fixture's file name but not in its
/// contents, so a match presumably relies on path information being indexed —
/// confirm against the index implementation.
#[test]
#[ignore = "requires model2vec download (~16 MB on first run)"]
fn parity_keyword_only_mode_finds_path_via_enrichment() {
    let workdir = tempfile::TempDir::new().unwrap();
    let corpus = build_test_corpus(&workdir);
    let index = load_index(&corpus);

    let hits = index.search("auth_service", 5, SearchMode::Keyword, None, None, None);
    assert!(!hits.is_empty(), "expected non-empty BM25-only results");

    // First tuple element of a hit is the position of its chunk.
    let all_chunks = index.chunks();
    let (best, _) = &hits[0];
    let top_path = &all_chunks[*best].file_path;

    let is_service_file = top_path.contains("auth_service.rs");
    assert!(
        is_service_file,
        "expected auth_service.rs first under keyword mode; got {top_path}"
    );
}