simjoin_bench/
simjoin_bench.rs1#![allow(clippy::cast_possible_truncation, clippy::cast_precision_loss, clippy::cast_sign_loss)]
8
9use std::time::Instant;
10
11use difflib_fast::simjoin::{cosine_join, Corpus};
12#[cfg(feature = "profiling")]
13use difflib_fast::simjoin::cosine_join_counts;
14
15fn arg<T: std::str::FromStr>(i: usize, def: T) -> T {
16 std::env::args().nth(i).and_then(|s| s.parse().ok()).unwrap_or(def)
17}
18
19fn gen(n: usize, nnz: usize, ndims: usize, seed: u64) -> Vec<Vec<(u32, f64)>> {
23 let mut s = seed;
24 let mut next = move || {
25 s ^= s << 13;
26 s ^= s >> 7;
27 s ^= s << 17;
28 s
29 };
30 let mut sets: Vec<Vec<u32>> = Vec::with_capacity(n);
33 for i in 0..n {
34 let dup = i > 0 && (next() % 100) < 5;
35 let mut v: Vec<u32> = if dup {
36 let src = (next() as usize) % i;
37 let mut c = sets[src].clone();
38 if !c.is_empty() {
39 let k = (next() as usize) % c.len();
40 c[k] = (next() % ndims as u64) as u32; }
42 c
43 } else {
44 (0..nnz)
45 .map(|_| {
46 let u = (next() >> 11) as f64 / (1u64 << 53) as f64; ((u * u * u) * ndims as f64) as u32 % ndims as u32
48 })
49 .collect()
50 };
51 v.sort_unstable();
52 v.dedup();
53 sets.push(v);
54 }
55 let mut df = vec![0u32; ndims];
56 for v in &sets {
57 for &d in v {
58 df[d as usize] += 1;
59 }
60 }
61 sets.into_iter()
63 .map(|v| {
64 v.into_iter()
65 .map(|d| {
66 let idf = (n as f64 / f64::from(df[d as usize]).max(1.0)).ln();
67 (d, idf)
68 })
69 .collect()
70 })
71 .collect()
72}
73
74fn main() {
75 let n: usize = arg(1, 100_000);
76 let nnz: usize = arg(2, 14);
77 let ndims: usize = arg(3, 20_000);
78 let t: f64 = arg(4, 0.7);
79 let reps: usize = arg(5, 3);
80
81 let rows = gen(n, nnz, ndims, 0x1234_5678_9abc_def1);
82 let build0 = Instant::now();
83 let corpus = Corpus::from_rows(&rows);
84 let build_ms = build0.elapsed().as_secs_f64() * 1000.0;
85
86 #[cfg(feature = "profiling")]
89 if std::env::var("STATS").is_ok() {
90 let (ncand, survivors, pairs) = cosine_join_counts(&corpus, t);
91 eprintln!(
92 "STATS n={n} t={t} | candidates={ncand} survivors(cos_full)={survivors} pairs={pairs} \
93 | prune_pass={:.4} cos_full_saved={:.4} survivor_precision={:.3}",
94 survivors as f64 / ncand.max(1) as f64,
95 1.0 - survivors as f64 / ncand.max(1) as f64,
96 pairs as f64 / survivors.max(1) as f64,
97 );
98 }
99
100 let mut ms: Vec<f64> = Vec::with_capacity(reps);
101 let mut npairs = 0usize;
102 for _ in 0..reps {
103 let t0 = Instant::now();
104 let pairs = cosine_join(&corpus, t);
105 ms.push(t0.elapsed().as_secs_f64() * 1000.0);
106 npairs = pairs.len();
107 std::hint::black_box(&pairs);
108 }
109 ms.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
110 eprintln!(
111 "n={n} nnz={nnz} ndims={ndims} t={t} | build={build_ms:.0}ms | join: min={:.1}ms median={:.1}ms | pairs={npairs}",
112 ms[0],
113 ms[reps / 2],
114 );
115}