simjoin_pypi/
simjoin_pypi.rs1#![allow(
8 clippy::cast_possible_truncation,
9 clippy::cast_precision_loss,
10 clippy::many_single_char_names,
11 clippy::doc_markdown
12)]
13
14use std::time::Instant;
15
16use difflib_fast::simjoin::{cosine_join, Corpus};
17#[cfg(feature = "profiling")]
18use difflib_fast::simjoin::cosine_join_counts;
19
/// Decodes the little-endian `u32` stored in the four bytes of `b` starting at offset `p`.
///
/// # Panics
/// Panics if `b` does not contain four bytes starting at `p`.
fn le_u32(b: &[u8], p: usize) -> u32 {
    let mut word = [0u8; 4];
    word.copy_from_slice(&b[p..p + 4]);
    u32::from_le_bytes(word)
}
23
/// Returns the `i`-th command-line argument parsed as `T`.
///
/// Falls back to `def` when the argument is absent or does not parse as `T`.
fn arg<T: std::str::FromStr>(i: usize, def: T) -> T {
    match std::env::args().nth(i) {
        Some(raw) => raw.parse().unwrap_or(def),
        None => def,
    }
}
27
28fn load(path: &str) -> Vec<Vec<(u32, f64)>> {
31 let b = std::fs::read(path).expect("read corpus");
32 assert!(b.len() >= 16, "corpus too small");
33 let magic = u64::from_le_bytes(b[0..8].try_into().unwrap());
34 assert_eq!(magic, 0x5349_4D4A_4F49_4E31, "bad magic");
35 let n = le_u32(&b, 8) as usize;
36 let mut p = 16usize;
37 let mut rows = Vec::with_capacity(n);
38 for _ in 0..n {
39 let nnz = le_u32(&b, p) as usize;
40 p += 4;
41 let mut row = Vec::with_capacity(nnz);
42 for _ in 0..nnz {
43 let d = le_u32(&b, p);
44 let w = f64::from(f32::from_le_bytes(b[p + 4..p + 8].try_into().unwrap()));
45 p += 8;
46 row.push((d, w));
47 }
48 rows.push(row);
49 }
50 rows
51}
52
53fn main() {
54 let path: String = arg(1, "perf-local/pypi-type3.simjoin.bin".to_string());
55 let t: f64 = arg(2, 0.8);
56 let reps: usize = arg(3, 3);
57
58 let rows = load(&path);
59 let n = rows.len();
60 let nnz: usize = rows.iter().map(Vec::len).sum();
61
62 let b0 = Instant::now();
63 let corpus = Corpus::from_rows(&rows);
64 let build_ms = b0.elapsed().as_secs_f64() * 1000.0;
65
66 #[cfg(feature = "profiling")]
67 if std::env::var("STATS").is_ok() {
68 let (ncand, survivors, pairs) = cosine_join_counts(&corpus, t);
69 eprintln!(
70 "STATS n={n} t={t} | candidates={ncand} survivors(cos_full)={survivors} pairs={pairs} \
71 | prune_pass={:.4} survivor_precision={:.3}",
72 survivors as f64 / ncand.max(1) as f64,
73 pairs as f64 / survivors.max(1) as f64,
74 );
75 }
76
77 let mut ms: Vec<f64> = Vec::with_capacity(reps);
78 let mut npairs = 0usize;
79 for _ in 0..reps {
80 let t0 = Instant::now();
81 let pairs = cosine_join(&corpus, t);
82 ms.push(t0.elapsed().as_secs_f64() * 1000.0);
83 npairs = pairs.len();
84 std::hint::black_box(&pairs);
85 }
86 ms.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
87 eprintln!(
88 "pypi-type3 n={n} nnz_total={nnz} mean_nnz={:.1} t={t} | build={build_ms:.0}ms | \
89 join: min={:.1}ms median={:.1}ms | pairs={npairs}",
90 nnz as f64 / n as f64,
91 ms[0],
92 ms[reps / 2],
93 );
94}