Skip to main content

simjoin_pypi/
simjoin_pypi.rs

//! Real-data `simjoin` bench: load a binary Type-3 corpus (built by `scripts/simjoin-corpus.py`
//! from the top-300 PyPI snapshot) and time `cosine_join` on it.
//!
//! `cargo run --release --example simjoin_pypi -- <corpus.bin> [threshold] [reps]`
//! Defaults: threshold=0.8 reps=3.  Thread count via `RAYON_NUM_THREADS`.
6
7#![allow(
8    clippy::cast_possible_truncation,
9    clippy::cast_precision_loss,
10    clippy::many_single_char_names,
11    clippy::doc_markdown
12)]
13
14use std::time::Instant;
15
16use difflib_fast::simjoin::{cosine_join, Corpus};
17#[cfg(feature = "profiling")]
18use difflib_fast::simjoin::cosine_join_counts;
19
/// Decode the little-endian `u32` stored in the four bytes of `b` starting at offset `p`.
///
/// # Panics
/// Panics if `b` does not contain four bytes starting at `p`.
fn le_u32(b: &[u8], p: usize) -> u32 {
    let mut word = [0u8; 4];
    word.copy_from_slice(&b[p..p + 4]);
    u32::from_le_bytes(word)
}
23
/// Return the `i`-th command-line argument parsed as `T`, or `def` when that argument is
/// absent or does not parse.
fn arg<T: std::str::FromStr>(i: usize, def: T) -> T {
    match std::env::args().nth(i) {
        Some(raw) => raw.parse().unwrap_or(def),
        None => def,
    }
}
27
/// Parse the `SIMJOIN1` binary at `path` into raw `(dim, weight)` rows (weights kept as `f64`;
/// `Corpus` L2-normalises and merges duplicate dims).
///
/// Layout as read here, all integers little-endian:
/// - bytes `0..8`: magic; read as a `u64` it must equal `0x5349_4D4A_4F49_4E31`,
/// - bytes `8..12`: row count as `u32` (bytes `12..16` are not interpreted by this loader),
/// - from byte 16: per row, a `u32` entry count followed by that many `(u32 dim, f32 weight)`
///   pairs of 8 bytes each.
///
/// Bytes after the last declared row are ignored.
///
/// # Panics
/// Panics if the file cannot be read, is shorter than the 16-byte header, has the wrong magic,
/// or declares more rows/entries than the remaining bytes can hold ("corpus truncated").
fn load(path: &str) -> Vec<Vec<(u32, f64)>> {
    const HEADER_LEN: usize = 16;
    const COUNT_LEN: usize = 4;
    const ENTRY_LEN: usize = 8;

    let b = std::fs::read(path).expect("read corpus");
    assert!(b.len() >= HEADER_LEN, "corpus too small");

    let mut magic = [0u8; 8];
    magic.copy_from_slice(&b[..8]);
    assert_eq!(u64::from_le_bytes(magic), 0x5349_4D4A_4F49_4E31, "bad magic");

    let read_u32 = |at: usize| u32::from_le_bytes([b[at], b[at + 1], b[at + 2], b[at + 3]]);

    let n = read_u32(8) as usize;
    // Invariant from here on: `p <= b.len()`, so `b.len() - p` never underflows.
    let mut p = HEADER_LEN;

    // The counts come from the file: validate them against the bytes actually present *before*
    // allocating, so a corrupt header cannot request an enormous `Vec`. Every row occupies at
    // least its 4-byte entry count.
    assert!(
        n <= (b.len() - p) / COUNT_LEN,
        "corpus truncated: header declares {n} rows but only {} bytes follow",
        b.len() - p
    );

    let mut rows = Vec::with_capacity(n);
    for _ in 0..n {
        assert!(
            b.len() - p >= COUNT_LEN,
            "corpus truncated: no row header at byte {p} ({} of {n} rows read)",
            rows.len()
        );
        let nnz = read_u32(p) as usize;
        p += COUNT_LEN;

        assert!(
            nnz <= (b.len() - p) / ENTRY_LEN,
            "corpus truncated: row at byte {p} declares {nnz} entries but only {} bytes remain",
            b.len() - p
        );
        let end = p + nnz * ENTRY_LEN;
        let row = b[p..end]
            .chunks_exact(ENTRY_LEN)
            .map(|e| {
                let d = u32::from_le_bytes([e[0], e[1], e[2], e[3]]);
                let w = f32::from_le_bytes([e[4], e[5], e[6], e[7]]);
                (d, f64::from(w))
            })
            .collect();
        p = end;
        rows.push(row);
    }
    rows
}
52
53fn main() {
54    let path: String = arg(1, "perf-local/pypi-type3.simjoin.bin".to_string());
55    let t: f64 = arg(2, 0.8);
56    let reps: usize = arg(3, 3);
57
58    let mut rows = load(&path);
59    // Optional row subset (SJ_NSUB) to bench the small-corpus regime (e.g. find-dup-defs scale
60    // ~3216 functions) instead of the full 287k bandwidth-bound regime.
61    if let Ok(nsub) = std::env::var("SJ_NSUB") {
62        if let Ok(k) = nsub.parse::<usize>() {
63            if rows.len() > k {
64                rows.truncate(k);
65            }
66        }
67    }
68    let n = rows.len();
69    let nnz: usize = rows.iter().map(Vec::len).sum();
70
71    let b0 = Instant::now();
72    let corpus = Corpus::from_rows(&rows);
73    let build_ms = b0.elapsed().as_secs_f64() * 1000.0;
74
75    #[cfg(feature = "profiling")]
76    if std::env::var("STATS").is_ok() {
77        let (ncand, survivors, pairs) = cosine_join_counts(&corpus, t);
78        eprintln!(
79            "STATS n={n} t={t} | candidates={ncand} survivors(cos_full)={survivors} pairs={pairs} \
80             | prune_pass={:.4} survivor_precision={:.3}",
81            survivors as f64 / ncand.max(1) as f64,
82            pairs as f64 / survivors.max(1) as f64,
83        );
84    }
85
86    let mut ms: Vec<f64> = Vec::with_capacity(reps);
87    let mut npairs = 0usize;
88    for _ in 0..reps {
89        let t0 = Instant::now();
90        let pairs = cosine_join(&corpus, t);
91        ms.push(t0.elapsed().as_secs_f64() * 1000.0);
92        npairs = pairs.len();
93        std::hint::black_box(&pairs);
94    }
95    ms.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
96    eprintln!(
97        "pypi-type3 n={n} nnz_total={nnz} mean_nnz={:.1} t={t} | build={build_ms:.0}ms | \
98         join: min={:.1}ms median={:.1}ms | pairs={npairs}",
99        nnz as f64 / n as f64,
100        ms[0],
101        ms[reps / 2],
102    );
103}