jin 0.1.0 - Docs.rs

//! SIFT-128 Benchmark - Real ANN Benchmark Dataset
//!
//! Downloads and benchmarks against the SIFT-128-euclidean dataset from ann-benchmarks.
//!
//! Dataset: 1M 128-dimensional vectors with 10K test queries and ground truth.
//! Source: http://ann-benchmarks.com/sift-128-euclidean.hdf5
//!
//! ```bash
//! # Download dataset first (501MB)
//! curl -o data/sift-128-euclidean.hdf5 http://ann-benchmarks.com/sift-128-euclidean.hdf5
//!
//! # Run benchmark
//! cargo run --example sift_benchmark --release --features hdf5
//! ```

use std::path::Path;
use std::time::Instant;

fn main() {
    println!("SIFT-128 ANN Benchmark");
    println!("======================\n");

    let dataset_path = "data/sift-128-euclidean.hdf5";

    if !Path::new(dataset_path).exists() {
        println!("Dataset not found at: {}", dataset_path);
        println!();
        println!("To run this benchmark, download the SIFT-128 dataset:");
        println!();
        println!("  mkdir -p data");
        println!(
            "  curl -o {} http://ann-benchmarks.com/sift-128-euclidean.hdf5",
            dataset_path
        );
        println!();
        println!("Dataset size: 501MB");
        println!("Alternative smaller datasets:");
        println!("  - GloVe-25 (121MB):  http://ann-benchmarks.com/glove-25-angular.hdf5");
        println!(
            "  - Fashion-MNIST (217MB): http://ann-benchmarks.com/fashion-mnist-784-euclidean.hdf5"
        );
        println!();

        // Run a mini demo with synthetic data instead
        println!("Running mini demo with synthetic data instead...\n");
        run_synthetic_demo();
        return;
    }

    // When HDF5 support is available:
    #[cfg(feature = "hdf5")]
    {
        run_real_benchmark(dataset_path);
    }

    #[cfg(not(feature = "hdf5"))]
    {
        println!("HDF5 feature not enabled. Compile with --features hdf5");
        println!("Running synthetic demo instead...\n");
        run_synthetic_demo();
    }
}

fn run_synthetic_demo() {
    use jin::hnsw::HNSWIndex;

    let n = 50_000;
    let dim = 128;
    let n_queries = 1000;
    let k = 10;

    println!("Synthetic benchmark: {} vectors, {} dims", n, dim);

    // Generate normalized vectors
    let vectors: Vec<Vec<f32>> = (0..n)
        .map(|i| normalize(&generate_vector(i, dim)))
        .collect();

    // Build index
    let build_start = Instant::now();
    let mut index = HNSWIndex::new(dim, 16, 200).unwrap();
    for (i, vec) in vectors.iter().enumerate() {
        index.add(i as u32, vec.clone()).unwrap();
    }
    index.build().unwrap();
    let build_time = build_start.elapsed();
    println!("Build time: {:?}", build_time);

    // Generate queries (perturbations of existing vectors)
    let queries: Vec<Vec<f32>> = (0..n_queries)
        .map(|i| {
            let base_idx = (i * 7) % n;
            let perturbed: Vec<f32> = vectors[base_idx]
                .iter()
                .enumerate()
                .map(|(j, &v)| {
                    let noise = ((i * dim + j) as f32 * 0.0001).sin() * 0.1;
                    v + noise
                })
                .collect();
            normalize(&perturbed)
        })
        .collect();

    // Benchmark HNSW
    let ef = 100;
    let hnsw_start = Instant::now();
    let mut hnsw_results = Vec::with_capacity(n_queries);
    for query in &queries {
        let results = index.search(query, k, ef).unwrap();
        hnsw_results.push(results);
    }
    let hnsw_time = hnsw_start.elapsed();

    // Brute force for ground truth
    let brute_start = Instant::now();
    let mut brute_results = Vec::with_capacity(n_queries);
    for query in &queries {
        let mut distances: Vec<_> = vectors
            .iter()
            .enumerate()
            .map(|(i, v)| (i, cosine_distance(query, v)))
            .collect();
        distances.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
        brute_results.push(distances.into_iter().take(k).collect::<Vec<_>>());
    }
    let brute_time = brute_start.elapsed();

    // Calculate recall
    let mut total_recall = 0.0;
    for (hnsw, brute) in hnsw_results.iter().zip(brute_results.iter()) {
        let hnsw_ids: std::collections::HashSet<u32> = hnsw.iter().map(|r| r.0).collect();
        let brute_ids: std::collections::HashSet<u32> = brute.iter().map(|r| r.0 as u32).collect();
        let intersection = hnsw_ids.intersection(&brute_ids).count();
        total_recall += intersection as f64 / k as f64;
    }
    let avg_recall = total_recall / n_queries as f64;

    let hnsw_qps = n_queries as f64 / hnsw_time.as_secs_f64();
    let brute_qps = n_queries as f64 / brute_time.as_secs_f64();

    println!("\n--- Results ---");
    println!("HNSW:        {:.1} QPS ({:?})", hnsw_qps, hnsw_time);
    println!("Brute force: {:.1} QPS ({:?})", brute_qps, brute_time);
    println!(
        "Speedup:     {:.1}x",
        brute_time.as_secs_f64() / hnsw_time.as_secs_f64()
    );
    println!("Recall@{}:   {:.1}%", k, avg_recall * 100.0);
}

#[cfg(feature = "hdf5")]
fn run_real_benchmark(path: &str) {
    use hdf5::File;
    use jin::hnsw::HNSWIndex;

    println!("Loading SIFT-128 dataset from {}...", path);

    let file = File::open(path).expect("Failed to open HDF5 file");

    // Load train vectors
    let train = file.dataset("train").expect("No 'train' dataset");
    let train_data: ndarray::Array2<f32> = train.read().expect("Failed to read train data");
    let (n, dim) = train_data.dim();
    println!("Train: {} vectors, {} dimensions", n, dim);

    // Load test vectors
    let test = file.dataset("test").expect("No 'test' dataset");
    let test_data: ndarray::Array2<f32> = test.read().expect("Failed to read test data");
    let n_queries = test_data.nrows();
    println!("Test: {} queries", n_queries);

    // Load ground truth (neighbors)
    let neighbors = file.dataset("neighbors").expect("No 'neighbors' dataset");
    let gt_data: ndarray::Array2<i32> = neighbors.read().expect("Failed to read neighbors");

    // Build index
    let build_start = Instant::now();
    let mut index = HNSWIndex::new(dim, 16, 200).unwrap();
    for (i, row) in train_data.rows().into_iter().enumerate() {
        let vec: Vec<f32> = row.to_vec();
        index.add(i as u32, vec).unwrap();
    }
    let build_time = build_start.elapsed();
    println!("Build time: {:?}", build_time);

    // Search and evaluate
    let k = 10;
    let ef_values = [50, 100, 200, 400];

    println!(
        "\n{:>8} {:>12} {:>12} {:>10}",
        "ef", "QPS", "Latency", "Recall@10"
    );
    println!("{}", "-".repeat(50));

    for ef in ef_values {
        let search_start = Instant::now();
        let mut total_recall = 0.0;

        for (i, query_row) in test_data.rows().into_iter().enumerate() {
            let query: Vec<f32> = query_row.to_vec();
            let results = index.search(&query, k, ef).unwrap();

            // Calculate recall against ground truth
            let gt: std::collections::HashSet<i32> =
                gt_data.row(i).iter().take(k).copied().collect();
            let found: std::collections::HashSet<i32> =
                results.iter().map(|r| r.0 as i32).collect();
            let intersection = gt.intersection(&found).count();
            total_recall += intersection as f64 / k as f64;
        }

        let search_time = search_start.elapsed();
        let qps = n_queries as f64 / search_time.as_secs_f64();
        let latency_us = search_time.as_micros() as f64 / n_queries as f64;
        let recall = total_recall / n_queries as f64 * 100.0;

        println!(
            "{:>8} {:>12.1} {:>10.1}us {:>9.1}%",
            ef, qps, latency_us, recall
        );
    }
}

fn generate_vector(seed: usize, dim: usize) -> Vec<f32> {
    (0..dim)
        .map(|j| {
            let x = (seed * dim + j) as f32;
            (x * 0.618033988).fract() * 2.0 - 1.0
        })
        .collect()
}

fn normalize(v: &[f32]) -> Vec<f32> {
    let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
    if norm > f32::EPSILON {
        v.iter().map(|x| x / norm).collect()
    } else {
        v.to_vec()
    }
}

fn cosine_distance(a: &[f32], b: &[f32]) -> f32 {
    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
    let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
    1.0 - dot / (norm_a * norm_b + f32::EPSILON)
}