simjoin_pypi/
simjoin_pypi.rs1#![allow(
8 clippy::cast_possible_truncation,
9 clippy::cast_precision_loss,
10 clippy::many_single_char_names,
11 clippy::doc_markdown
12)]
13
14use std::time::Instant;
15
16use difflib_fast::simjoin::{cosine_join, Corpus};
17#[cfg(feature = "profiling")]
18use difflib_fast::simjoin::cosine_join_counts;
19
/// Decodes the little-endian `u32` held in the four bytes of `b` that start at offset `p`.
///
/// # Panics
///
/// Panics if `b` does not contain the range `p..p + 4`.
fn le_u32(b: &[u8], p: usize) -> u32 {
    let mut word = [0u8; 4];
    word.copy_from_slice(&b[p..p + 4]);
    u32::from_le_bytes(word)
}
23
/// Returns the `i`-th command-line argument parsed as `T`.
///
/// Falls back to `def` when the argument is absent or does not parse as `T`.
fn arg<T: std::str::FromStr>(i: usize, def: T) -> T {
    match std::env::args().nth(i) {
        Some(raw) => raw.parse::<T>().unwrap_or(def),
        None => def,
    }
}
27
28fn load(path: &str) -> Vec<Vec<(u32, f64)>> {
31 let b = std::fs::read(path).expect("read corpus");
32 assert!(b.len() >= 16, "corpus too small");
33 let magic = u64::from_le_bytes(b[0..8].try_into().unwrap());
34 assert_eq!(magic, 0x5349_4D4A_4F49_4E31, "bad magic");
35 let n = le_u32(&b, 8) as usize;
36 let mut p = 16usize;
37 let mut rows = Vec::with_capacity(n);
38 for _ in 0..n {
39 let nnz = le_u32(&b, p) as usize;
40 p += 4;
41 let mut row = Vec::with_capacity(nnz);
42 for _ in 0..nnz {
43 let d = le_u32(&b, p);
44 let w = f64::from(f32::from_le_bytes(b[p + 4..p + 8].try_into().unwrap()));
45 p += 8;
46 row.push((d, w));
47 }
48 rows.push(row);
49 }
50 rows
51}
52
53fn main() {
54 let path: String = arg(1, "perf-local/pypi-type3.simjoin.bin".to_string());
55 let t: f64 = arg(2, 0.8);
56 let reps: usize = arg(3, 3);
57
58 let mut rows = load(&path);
59 if let Ok(nsub) = std::env::var("SJ_NSUB") {
62 if let Ok(k) = nsub.parse::<usize>() {
63 if rows.len() > k {
64 rows.truncate(k);
65 }
66 }
67 }
68 let n = rows.len();
69 let nnz: usize = rows.iter().map(Vec::len).sum();
70
71 let b0 = Instant::now();
72 let corpus = Corpus::from_rows(&rows);
73 let build_ms = b0.elapsed().as_secs_f64() * 1000.0;
74
75 #[cfg(feature = "profiling")]
76 if std::env::var("STATS").is_ok() {
77 let (ncand, survivors, pairs) = cosine_join_counts(&corpus, t);
78 eprintln!(
79 "STATS n={n} t={t} | candidates={ncand} survivors(cos_full)={survivors} pairs={pairs} \
80 | prune_pass={:.4} survivor_precision={:.3}",
81 survivors as f64 / ncand.max(1) as f64,
82 pairs as f64 / survivors.max(1) as f64,
83 );
84 }
85
86 let mut ms: Vec<f64> = Vec::with_capacity(reps);
87 let mut npairs = 0usize;
88 for _ in 0..reps {
89 let t0 = Instant::now();
90 let pairs = cosine_join(&corpus, t);
91 ms.push(t0.elapsed().as_secs_f64() * 1000.0);
92 npairs = pairs.len();
93 std::hint::black_box(&pairs);
94 }
95 ms.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
96 eprintln!(
97 "pypi-type3 n={n} nnz_total={nnz} mean_nnz={:.1} t={t} | build={build_ms:.0}ms | \
98 join: min={:.1}ms median={:.1}ms | pairs={npairs}",
99 nnz as f64 / n as f64,
100 ms[0],
101 ms[reps / 2],
102 );
103}