use std::collections::HashSet;
use std::io::Write;
use std::time::Instant;
use ruve::database::Database;
/// One benchmark configuration: how many vectors to index and at what dimensionality.
struct Scenario {
    /// Identifier matched against command-line arguments; also used to name the
    /// scenario's on-disk database directory.
    key: &'static str,
    /// Short tag printed in square brackets in the scenario header.
    label: &'static str,
    /// Human-readable summary printed next to the label.
    desc: &'static str,
    /// Number of vectors (and accompanying texts) generated and inserted.
    n: usize,
    /// Number of components in every generated data and query vector.
    dims: usize,
}
/// Every scenario the benchmark knows about, looked up by `key` in `main`.
const SCENARIOS: &[Scenario] = &[
    Scenario {
        key: "xs",
        label: "XS",
        desc: "1 K × 128d",
        n: 1_000,
        dims: 128,
    },
    Scenario {
        key: "small",
        label: "S",
        desc: "10 K × 128d",
        n: 10_000,
        dims: 128,
    },
    Scenario {
        key: "medium",
        label: "M",
        desc: "50 K × 128d",
        n: 50_000,
        dims: 128,
    },
    Scenario {
        key: "large",
        label: "L",
        desc: "100 K × 128d",
        n: 100_000,
        dims: 128,
    },
    Scenario {
        key: "highdim",
        label: "HD",
        desc: "10 K × 768d",
        n: 10_000,
        dims: 768,
    },
];
/// Word pool that `random_text` samples from when building synthetic documents.
const VOCABULARY: &[&str] = &[
    "alpha", "beta", "gamma", "delta", "epsilon", "zeta",
    "eta", "theta", "iota", "kappa", "lambda", "mu",
    "nu", "xi", "omicron", "pi", "rho", "sigma",
    "tau", "upsilon", "phi", "chi", "psi", "omega",
];
/// Query strings the text-search benchmark cycles through, in this order.
/// Every word also appears in `VOCABULARY`.
const SEARCH_TERMS: &[&str] = &[
    "alpha beta",
    "gamma delta epsilon",
    "omega psi chi",
    "sigma tau",
    "lambda mu nu",
    "theta iota kappa",
];
/// Builds a random vector with `dims` components and scales it to unit length.
///
/// Each component starts as `rand::random::<f32>() * 2.0 - 1.0`. The Euclidean
/// norm is floored at `1e-10` before dividing, so an all-zero draw cannot cause
/// a division by zero.
fn random_unit_vector(dims: usize) -> Vec<f32> {
    let mut components: Vec<f32> = (0..dims)
        .map(|_| rand::random::<f32>() * 2.0 - 1.0)
        .collect();

    let length = components
        .iter()
        .map(|c| c * c)
        .sum::<f32>()
        .sqrt()
        .max(1e-10);

    // Normalise in place rather than collecting into a second vector.
    for c in components.iter_mut() {
        *c /= length;
    }
    components
}
/// Produces a space-separated string of 5 to 20 words picked at random
/// (with repetition) from `VOCABULARY`.
fn random_text() -> String {
    // 5 plus a value in 0..16, giving between 5 and 20 words inclusive.
    let word_count = 5 + (rand::random::<u8>() as usize % 16);

    let mut text = String::new();
    for position in 0..word_count {
        if position > 0 {
            text.push(' ');
        }
        let pick = rand::random::<u32>() as usize % VOCABULARY.len();
        text.push_str(VOCABULARY[pick]);
    }
    text
}
/// Returns the `p`-th percentile (0–100) of an ascending-sorted slice.
///
/// The index is `p / 100 * (len - 1)` truncated towards zero and clamped to the
/// last element; no interpolation is performed. An empty slice yields `0.0`.
fn percentile(sorted: &[f64], p: f64) -> f64 {
    match sorted.len() {
        0 => 0.0,
        len => {
            let fraction = p / 100.0;
            // The float-to-usize cast saturates, so negative `p` maps to index 0.
            let rank = (fraction * (len as f64 - 1.0)) as usize;
            let last = len - 1;
            sorted[if rank > last { last } else { rank }]
        }
    }
}
/// Opens the benchmark database for scenario `key` and wipes it before returning.
///
/// The five backing files are placed under `/tmp/ruve_bench_{key}`, which is
/// created if it does not exist. `wipe()` is called on the new handle before it
/// is returned (presumably to discard data left by an earlier run — confirm
/// against `Database::wipe`).
///
/// # Panics
///
/// Panics with the directory path and the OS error if the directory cannot be
/// created.
fn make_db(key: &str) -> Database {
    let dir = format!("/tmp/ruve_bench_{key}");
    if let Err(err) = std::fs::create_dir_all(&dir) {
        // A benchmark cannot proceed without its scratch directory; fail loudly
        // and say which path was the problem.
        panic!("failed to create benchmark directory {dir}: {err}");
    }

    // Every backing file lives directly inside `dir`.
    let path = |file: &str| format!("{dir}/{file}");

    let mut db = Database::new(
        &path("data.bin"),
        &path("index.json"),
        &path("bm25.json"),
        &path("hnsw.json"),
        &path("graph.bin"),
    );
    db.wipe();
    db
}
/// Sorts `latencies_ms` ascending and prints one summary row: query count,
/// throughput over `total_s` seconds, and the p50/p95/p99 latencies.
fn report_latencies(label: &str, latencies_ms: &mut [f64], total_s: f64) {
    // Elapsed times are always finite, so a total order is safe and avoids
    // an `unwrap` on `partial_cmp`.
    latencies_ms.sort_unstable_by(|a, b| a.total_cmp(b));
    let sorted: &[f64] = latencies_ms;
    let count = sorted.len();
    println!(" {:<14} {:>8} queries {:>9.0} qps p50={:.2}ms p95={:.2}ms p99={:.2}ms",
        label, count, count as f64 / total_s,
        percentile(sorted, 50.0),
        percentile(sorted, 95.0),
        percentile(sorted, 99.0),
    );
}

/// Runs the full benchmark for one scenario and prints a result row per phase.
///
/// Phases, in order:
/// 1. generate `scenario.n` random unit vectors and texts plus `n_queries` query vectors;
/// 2. insert everything into a freshly wiped database, timing the whole loop;
/// 3. time `n_queries` calls to `search_hnsw` after 20 untimed warm-up calls;
/// 4. time `n_queries` calls to `text_search`, cycling through `SEARCH_TERMS`;
/// 5. for scenarios with at most 50 000 vectors, compute recall@`k` of
///    `search_hnsw` against `search_scored` on up to `recall_samples` queries.
///
/// Progress messages go to stderr; result rows go to stdout.
///
/// * `n_queries` – number of timed vector and text queries.
/// * `recall_samples` – maximum number of query vectors used for the recall phase.
/// * `k` – result count for text search and recall; vector searches request `k * 4`
///   (NOTE(review): presumably an over-fetch factor — confirm against `search_hnsw`).
fn run_scenario(scenario: &Scenario, n_queries: usize, recall_samples: usize, k: usize) {
    println!("\n[{}] {}", scenario.label, scenario.desc);
    println!(" {}", "─".repeat(70));

    // --- Data generation -------------------------------------------------
    eprint!(" Generating {} vectors ({} dims)...", scenario.n, scenario.dims);
    std::io::stderr().flush().ok();
    let data_vecs: Vec<Vec<f32>> = (0..scenario.n).map(|_| random_unit_vector(scenario.dims)).collect();
    let query_vecs: Vec<Vec<f32>> = (0..n_queries).map(|_| random_unit_vector(scenario.dims)).collect();
    let texts: Vec<String> = (0..scenario.n).map(|_| random_text()).collect();
    eprintln!(" done");

    // --- Insert ----------------------------------------------------------
    let mut db = make_db(scenario.key);
    let t0 = Instant::now();
    // The vectors are not needed after insertion, so hand them over by value:
    // cloning here would add an allocation and copy per document to the timing.
    for (i, (v, text)) in data_vecs.into_iter().zip(texts.iter()).enumerate() {
        db.insert_raw(v, text, None);
        if (i + 1) % 10_000 == 0 {
            eprint!("\r Inserting {}/{}", i + 1, scenario.n);
            std::io::stderr().flush().ok();
        }
    }
    let insert_s = t0.elapsed().as_secs_f64();
    // Leading `\r` returns to column 0 so this row overwrites the progress line.
    println!("\r {:<14} {:>8} docs {:>9.0} ops/s {:.2}s",
        "Insert", scenario.n, scenario.n as f64 / insert_s, insert_s);

    // --- Vector search ---------------------------------------------------
    // Untimed warm-up so the first measured queries are not outliers.
    for v in query_vecs.iter().take(20) {
        db.search_hnsw(v, k * 4);
    }
    let mut latencies: Vec<f64> = Vec::with_capacity(n_queries);
    let t0 = Instant::now();
    for v in &query_vecs {
        let t = Instant::now();
        db.search_hnsw(v, k * 4);
        latencies.push(t.elapsed().as_secs_f64() * 1_000.0);
    }
    let search_s = t0.elapsed().as_secs_f64();
    report_latencies("Vector search", &mut latencies, search_s);

    // --- Text search -----------------------------------------------------
    let mut text_latencies: Vec<f64> = Vec::with_capacity(n_queries);
    let t0 = Instant::now();
    for term in SEARCH_TERMS.iter().copied().cycle().take(n_queries) {
        let t = Instant::now();
        db.text_search(term, k);
        text_latencies.push(t.elapsed().as_secs_f64() * 1_000.0);
    }
    let text_s = t0.elapsed().as_secs_f64();
    report_latencies("Text search", &mut text_latencies, text_s);

    // --- Recall ----------------------------------------------------------
    if scenario.n <= 50_000 {
        let mut hits = 0usize;
        // Total ground-truth ids and number of queries actually evaluated.
        // Using these instead of `recall_samples * k` keeps the ratio correct
        // when fewer query vectors exist than `recall_samples`, or when the
        // reference search returns fewer than `k` results.
        let mut expected = 0usize;
        let mut evaluated = 0usize;
        let t0 = Instant::now();
        for v in query_vecs.iter().take(recall_samples) {
            let brute: HashSet<String> = db.search_scored(v, k)
                .into_iter()
                .map(|(_, r)| r.id)
                .collect();
            let hnsw: HashSet<String> = db.search_hnsw(v, k * 4)
                .into_iter()
                .take(k)
                .map(|r| r.id)
                .collect();
            hits += brute.intersection(&hnsw).count();
            expected += brute.len();
            evaluated += 1;
        }
        let recall_s = t0.elapsed().as_secs_f64();
        // Report 0 rather than NaN when nothing could be evaluated.
        let recall = if expected == 0 {
            0.0
        } else {
            hits as f64 / expected as f64
        };
        println!(" {:<14} {:>8} queries recall@{k} = {:.3} {:.2}s",
            "Recall", evaluated, recall, recall_s);
    } else {
        println!(" {:<14} skipped (n > 50 K — brute-force too slow)", "Recall");
    }
}
fn main() {
let args: Vec<String> = std::env::args().skip(1).collect();
let keys: Vec<&str> = if args.is_empty() {
vec!["xs", "small"]
} else {
args.iter().map(|s| s.as_str()).collect()
};
let k = 10;
let n_queries = 200;
let recall_samples = 50;
println!("╔══════════════════════════════════════════════════════════════════════╗");
println!("║ RuVe HNSW Benchmark ║");
println!("╚══════════════════════════════════════════════════════════════════════╝");
println!("k={} queries={} recall_samples={}", k, n_queries, recall_samples);
println!("Available scenarios: xs · small · medium · large · highdim");
let unknown: Vec<&&str> = keys.iter()
.filter(|k| !SCENARIOS.iter().any(|s| &s.key == *k))
.collect();
if !unknown.is_empty() {
eprintln!("unknown scenarios: {}", unknown.iter().map(|s| **s).collect::<Vec<_>>().join(", "));
std::process::exit(1);
}
for key in &keys {
let scenario = SCENARIOS.iter().find(|s| &s.key == key).unwrap();
run_scenario(scenario, n_queries, recall_samples, k);
}
println!("\nDone.");
}