ruve-db 0.1.0

A hybrid vector and full-text search database with HNSW approximate nearest-neighbour indexing and BM25 text search.
Documentation
use std::collections::HashSet;
use std::io::Write;
use std::time::Instant;

use ruve::database::Database;

// ── Scenarios ────────────────────────────────────────────────────────────────

/// One benchmark configuration: how many vectors to generate and at what dimensionality.
struct Scenario {
    /// Identifier matched against command-line arguments; also used in the scratch directory name.
    key:   &'static str,
    /// Short tag printed in square brackets at the top of the scenario's report.
    label: &'static str,
    /// Human-readable size summary printed next to the label.
    desc:  &'static str,
    /// Number of vectors (and accompanying texts) generated and inserted.
    n:     usize,
    /// Number of components in every generated data and query vector.
    dims:  usize,
}

/// All scenarios selectable from the command line, looked up by `key` in `main`.
const SCENARIOS: &[Scenario] = &[
    Scenario { key: "xs",      label: "XS", desc: "1 K × 128d",   n:   1_000, dims: 128 },
    Scenario { key: "small",   label: "S",  desc: "10 K × 128d",  n:  10_000, dims: 128 },
    Scenario { key: "medium",  label: "M",  desc: "50 K × 128d",  n:  50_000, dims: 128 },
    Scenario { key: "large",   label: "L",  desc: "100 K × 128d", n: 100_000, dims: 128 },
    Scenario { key: "highdim", label: "HD", desc: "10 K × 768d",  n:  10_000, dims: 768  },
];

// ── Data generation ───────────────────────────────────────────────────────────

/// Word pool from which `random_text` draws the words of each synthetic document.
const VOCABULARY: &[&str] = &[
    "alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
    "iota", "kappa", "lambda", "mu", "nu", "xi", "omicron", "pi", "rho",
    "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
];

/// Fixed text queries cycled through (round-robin) by the text-search benchmark.
/// Every word here also appears in `VOCABULARY`.
const SEARCH_TERMS: &[&str] = &[
    "alpha beta",
    "gamma delta epsilon",
    "omega psi chi",
    "sigma tau",
    "lambda mu nu",
    "theta iota kappa",
];

/// Builds a random vector with `dims` components and rescales it to unit L2 length.
///
/// Each component is `rand::random::<f32>() * 2.0 - 1.0` (presumably uniform in
/// `[-1, 1)` — depends on `rand`'s default `f32` distribution). The norm is floored
/// at `1e-10` so an all-zero draw never divides by zero.
fn random_unit_vector(dims: usize) -> Vec<f32> {
    let mut components: Vec<f32> = Vec::with_capacity(dims);
    for _ in 0..dims {
        components.push(rand::random::<f32>() * 2.0 - 1.0);
    }

    let length = components
        .iter()
        .map(|c| c * c)
        .sum::<f32>()
        .sqrt()
        .max(1e-10);

    // Normalise in place instead of collecting a second vector.
    for c in &mut components {
        *c /= length;
    }
    components
}

/// Produces a synthetic document: between 5 and 20 words drawn at random
/// (with repetition) from `VOCABULARY`, separated by single spaces.
fn random_text() -> String {
    // A random byte reduced modulo 16 gives 0..=15 extra words on top of the minimum 5.
    let word_count = 5 + usize::from(rand::random::<u8>()) % 16;

    let mut words: Vec<&str> = Vec::with_capacity(word_count);
    for _ in 0..word_count {
        let pick = rand::random::<u32>() as usize % VOCABULARY.len();
        words.push(VOCABULARY[pick]);
    }
    words.join(" ")
}

// ── Stats ─────────────────────────────────────────────────────────────────────

/// Returns the `p`-th percentile of an ascending-sorted slice.
///
/// The rank is `p / 100 * (len - 1)` truncated towards zero, so no interpolation
/// between neighbours takes place. Ranks past the end are clamped to the last
/// element; negative ranks truncate to the first. An empty slice yields `0.0`.
fn percentile(sorted: &[f64], p: f64) -> f64 {
    match sorted.len() {
        0 => 0.0,
        len => {
            let last = len - 1;
            // Float-to-usize `as` truncates and saturates, so out-of-range `p` cannot overflow.
            let rank = ((p / 100.0) * last as f64) as usize;
            sorted[rank.min(last)]
        }
    }
}

// ── Database setup ────────────────────────────────────────────────────────────

/// Opens a `Database` backed by a per-scenario scratch directory and wipes it.
///
/// The directory is `<system temp dir>/ruve_bench_<key>`; using
/// `std::env::temp_dir()` instead of a literal `/tmp` keeps the benchmark usable
/// on platforms without that path. Five file paths inside the directory are
/// handed to `Database::new`, then `wipe()` is called (presumably to discard
/// anything left from a previous run — confirm against `Database`).
///
/// # Panics
///
/// Panics if the scratch directory cannot be created.
fn make_db(key: &str) -> Database {
    let dir = std::env::temp_dir().join(format!("ruve_bench_{key}"));
    std::fs::create_dir_all(&dir).unwrap_or_else(|err| {
        panic!("cannot create benchmark directory {}: {err}", dir.display())
    });

    // `Database::new` is called with string paths, so render each joined path as an
    // owned `String`. A non-UTF-8 temp dir would be rendered lossily here.
    let file = |name: &str| dir.join(name).to_string_lossy().into_owned();

    let mut db = Database::new(
        &file("data.bin"),
        &file("index.json"),
        &file("bm25.json"),
        &file("hnsw.json"),
        &file("graph.bin"),
    );
    db.wipe();
    db
}

// ── Scenario runner ───────────────────────────────────────────────────────────

/// Sorts per-query latency samples and prints one throughput / percentile row.
///
/// `latencies_ms` holds one wall-clock sample per query in milliseconds;
/// `elapsed_s` is the wall-clock duration of the whole query loop in seconds.
fn print_latency_row(label: &str, mut latencies_ms: Vec<f64>, elapsed_s: f64) {
    // Samples come from `Duration::as_secs_f64`, so the comparison cannot see NaN.
    latencies_ms.sort_by(|a, b| a.partial_cmp(b).expect("latency samples are never NaN"));
    let queries = latencies_ms.len();
    println!("  {:<14} {:>8} queries  {:>9.0} qps       p50={:.2}ms  p95={:.2}ms  p99={:.2}ms",
        label, queries, queries as f64 / elapsed_s,
        percentile(&latencies_ms, 50.0),
        percentile(&latencies_ms, 95.0),
        percentile(&latencies_ms, 99.0),
    );
}

/// Runs one benchmark scenario end to end and prints its report.
///
/// Steps: generate random data, time inserts, warm up, time vector search,
/// time text search, then (for `n <= 50_000`) measure recall of `search_hnsw`
/// against `search_scored`.
///
/// * `scenario` – dataset size and dimensionality to benchmark.
/// * `n_queries` – number of vector queries and of text queries to time.
/// * `recall_samples` – upper bound on how many query vectors are used for recall.
/// * `k` – result count for text search and recall; HNSW is asked for `k * 4`
///   candidates, of which the first `k` are scored for recall.
fn run_scenario(scenario: &Scenario, n_queries: usize, recall_samples: usize, k: usize) {
    println!("\n[{}] {}", scenario.label, scenario.desc);
    println!("  {}", "─".repeat(70));

    // ── Generate data ─────────────────────────────────────────────────────
    eprint!("  Generating {} vectors ({} dims)...", scenario.n, scenario.dims);
    std::io::stderr().flush().ok();
    let data_vecs:  Vec<Vec<f32>> = (0..scenario.n).map(|_| random_unit_vector(scenario.dims)).collect();
    let query_vecs: Vec<Vec<f32>> = (0..n_queries).map(|_| random_unit_vector(scenario.dims)).collect();
    let texts:      Vec<String>   = (0..scenario.n).map(|_| random_text()).collect();
    eprintln!(" done");

    // ── Insert ────────────────────────────────────────────────────────────
    let mut db = make_db(scenario.key);
    let t0 = Instant::now();
    // The data vectors are not needed after insertion, so move them into the
    // database; cloning each one here would be charged to the insert timing.
    for (i, (v, text)) in data_vecs.into_iter().zip(texts.iter()).enumerate() {
        db.insert_raw(v, text, None);
        if (i + 1) % 10_000 == 0 {
            eprint!("\r  Inserting {}/{}", i + 1, scenario.n);
            std::io::stderr().flush().ok();
        }
    }
    let insert_s = t0.elapsed().as_secs_f64();
    println!("\r  {:<14} {:>8} docs     {:>9.0} ops/s     {:.2}s",
        "Insert", scenario.n, scenario.n as f64 / insert_s, insert_s);

    // ── Warmup ────────────────────────────────────────────────────────────
    for v in query_vecs.iter().take(20) {
        db.search_hnsw(v, k * 4);
    }

    // ── Vector search ─────────────────────────────────────────────────────
    let mut latencies: Vec<f64> = Vec::with_capacity(n_queries);
    let t0 = Instant::now();
    for v in &query_vecs {
        let t = Instant::now();
        db.search_hnsw(v, k * 4);
        latencies.push(t.elapsed().as_secs_f64() * 1_000.0);
    }
    let search_s = t0.elapsed().as_secs_f64();
    print_latency_row("Vector search", latencies, search_s);

    // ── Text search ───────────────────────────────────────────────────────
    let mut text_latencies: Vec<f64> = Vec::with_capacity(n_queries);
    let t0 = Instant::now();
    for i in 0..n_queries {
        let term = SEARCH_TERMS[i % SEARCH_TERMS.len()];
        let t = Instant::now();
        db.text_search(term, k);
        text_latencies.push(t.elapsed().as_secs_f64() * 1_000.0);
    }
    let text_s = t0.elapsed().as_secs_f64();
    print_latency_row("Text search", text_latencies, text_s);

    // ── Recall@k ──────────────────────────────────────────────────────────
    // skip for large datasets where brute-force search_scored becomes too slow
    if scenario.n <= 50_000 {
        let mut hits = 0usize;
        let mut expected = 0usize;
        let mut evaluated = 0usize;
        let t0 = Instant::now();
        for v in query_vecs.iter().take(recall_samples) {
            // ground truth: linear scan over all vectors
            let brute: HashSet<String> = db.search_scored(v, k)
                .into_iter()
                .map(|(_, r)| r.id)
                .collect();
            let hnsw: HashSet<String> = db.search_hnsw(v, k * 4)
                .into_iter()
                .take(k)
                .map(|r| r.id)
                .collect();
            hits += brute.intersection(&hnsw).count();
            // Count what the ground truth actually returned rather than assuming
            // `recall_samples * k`: fewer queries may exist than `recall_samples`,
            // and the brute-force result may hold fewer than `k` ids.
            expected += brute.len();
            evaluated += 1;
        }
        let recall_s = t0.elapsed().as_secs_f64();
        // With nothing to find, report 0 instead of the NaN that 0/0 would give.
        let recall = if expected == 0 { 0.0 } else { hits as f64 / expected as f64 };
        println!("  {:<14} {:>8} queries  recall@{k} = {:.3}          {:.2}s",
            "Recall", evaluated, recall, recall_s);
    } else {
        println!("  {:<14}          skipped (n > 50 K — brute-force too slow)", "Recall");
    }
}

// ── Entry point ───────────────────────────────────────────────────────────────

fn main() {
    let args: Vec<String> = std::env::args().skip(1).collect();
    let keys: Vec<&str> = if args.is_empty() {
        vec!["xs", "small"]
    } else {
        args.iter().map(|s| s.as_str()).collect()
    };

    let k              = 10;
    let n_queries      = 200;
    let recall_samples = 50;

    println!("╔══════════════════════════════════════════════════════════════════════╗");
    println!("║                      RuVe HNSW Benchmark                            ║");
    println!("╚══════════════════════════════════════════════════════════════════════╝");
    println!("k={}  queries={}  recall_samples={}", k, n_queries, recall_samples);
    println!("Available scenarios: xs · small · medium · large · highdim");

    let unknown: Vec<&&str> = keys.iter()
        .filter(|k| !SCENARIOS.iter().any(|s| &s.key == *k))
        .collect();
    if !unknown.is_empty() {
        eprintln!("unknown scenarios: {}", unknown.iter().map(|s| **s).collect::<Vec<_>>().join(", "));
        std::process::exit(1);
    }

    for key in &keys {
        let scenario = SCENARIOS.iter().find(|s| &s.key == key).unwrap();
        run_scenario(scenario, n_queries, recall_samples, k);
    }

    println!("\nDone.");
}