Skip to main content

simjoin_bench/
simjoin_bench.rs

1//! Bench harness for `simjoin::cosine_join` on a synthetic Zipfian-skewed sparse corpus (few common
2//! dims, many rare — like IDF-weighted lines/tokens). Reproducible, no I/O, scalable.
3//!
4//! `cargo run --release --example simjoin_bench -- [n] [nnz] [ndims] [threshold] [reps]`
5//! defaults: n=100000 nnz=14 ndims=20000 t=0.7 reps=3
6
7#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss, clippy::cast_sign_loss)]
8
9use std::time::Instant;
10
11use difflib_fast::simjoin::{cosine_join, Corpus};
12#[cfg(feature = "profiling")]
13use difflib_fast::simjoin::cosine_join_counts;
14
15fn arg<T: std::str::FromStr>(i: usize, def: T) -> T {
16    std::env::args().nth(i).and_then(|s| s.parse().ok()).unwrap_or(def)
17}
18
19/// Deterministic xorshift → IDF-weighted sparse rows. Each vector draws `nnz` distinct dims with a
20/// cubic bias toward low ids (so low ids are common, high ids rare); the per-dim weight is its IDF
21/// `ln(n / df)` computed from the generated corpus — the realistic weighted-cosine input shape.
22fn gen(n: usize, nnz: usize, ndims: usize, seed: u64) -> Vec<Vec<(u32, f64)>> {
23    let mut s = seed;
24    let mut next = move || {
25        s ^= s << 13;
26        s ^= s >> 7;
27        s ^= s << 17;
28        s
29    };
30    // 1. distinct dims per vector (cubic skew → Zipfian-ish frequencies). ~5% of vectors are planted
31    //    near-duplicates of an earlier one (so real cosine clusters exist → the verify path runs).
32    let mut sets: Vec<Vec<u32>> = Vec::with_capacity(n);
33    for i in 0..n {
34        let dup = i > 0 && (next() % 100) < 5;
35        let mut v: Vec<u32> = if dup {
36            let src = (next() as usize) % i;
37            let mut c = sets[src].clone();
38            if !c.is_empty() {
39                let k = (next() as usize) % c.len();
40                c[k] = (next() % ndims as u64) as u32; // mutate one dim
41            }
42            c
43        } else {
44            (0..nnz)
45                .map(|_| {
46                    let u = (next() >> 11) as f64 / (1u64 << 53) as f64; // [0,1)
47                    ((u * u * u) * ndims as f64) as u32 % ndims as u32
48                })
49                .collect()
50        };
51        v.sort_unstable();
52        v.dedup();
53        sets.push(v);
54    }
55    let mut df = vec![0u32; ndims];
56    for v in &sets {
57        for &d in v {
58            df[d as usize] += 1;
59        }
60    }
61    // 2. weight each present dim by its IDF.
62    sets.into_iter()
63        .map(|v| {
64            v.into_iter()
65                .map(|d| {
66                    let idf = (n as f64 / f64::from(df[d as usize]).max(1.0)).ln();
67                    (d, idf)
68                })
69                .collect()
70        })
71        .collect()
72}
73
74fn main() {
75    let n: usize = arg(1, 100_000);
76    let nnz: usize = arg(2, 14);
77    let ndims: usize = arg(3, 20_000);
78    let t: f64 = arg(4, 0.7);
79    let reps: usize = arg(5, 3);
80
81    let rows = gen(n, nnz, ndims, 0x1234_5678_9abc_def1);
82    let build0 = Instant::now();
83    let corpus = Corpus::from_rows(&rows);
84    let build_ms = build0.elapsed().as_secs_f64() * 1000.0;
85
86    // Strategy diagnostic (profiling builds only): posting touches / candidates / pairs. The
87    // candidates-per-pair ratio decides whether to prune harder or speed the dot up.
88    #[cfg(feature = "profiling")]
89    if std::env::var("STATS").is_ok() {
90        let (ncand, survivors, pairs) = cosine_join_counts(&corpus, t);
91        eprintln!(
92            "STATS n={n} t={t} | candidates={ncand} survivors(cos_full)={survivors} pairs={pairs} \
93             | prune_pass={:.4} cos_full_saved={:.4} survivor_precision={:.3}",
94            survivors as f64 / ncand.max(1) as f64,
95            1.0 - survivors as f64 / ncand.max(1) as f64,
96            pairs as f64 / survivors.max(1) as f64,
97        );
98    }
99
100    let mut ms: Vec<f64> = Vec::with_capacity(reps);
101    let mut npairs = 0usize;
102    for _ in 0..reps {
103        let t0 = Instant::now();
104        let pairs = cosine_join(&corpus, t);
105        ms.push(t0.elapsed().as_secs_f64() * 1000.0);
106        npairs = pairs.len();
107        std::hint::black_box(&pairs);
108    }
109    ms.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
110    eprintln!(
111        "n={n} nnz={nnz} ndims={ndims} t={t} | build={build_ms:.0}ms | join: min={:.1}ms median={:.1}ms | pairs={npairs}",
112        ms[0],
113        ms[reps / 2],
114    );
115}