pub struct Corpus { /* private fields */ }Expand description
A corpus of L2-normalised sparse non-negative vectors in CSR form, with dimensions relabelled to a global rank (decreasing max weight). Built once; joinable at any threshold.
Implementations§
Source§impl Corpus
impl Corpus
Sourcepub fn from_token_docs<S: AsRef<str>>(docs: &[Vec<S>]) -> Corpus
pub fn from_token_docs<S: AsRef<str>>(docs: &[Vec<S>]) -> Corpus
Build a corpus from token documents — each document a list of string tokens — as TF-IDF
sparse vectors: dim = a distinct token, weight = (token count in doc) × ln(n / df_token)
(df = number of documents containing the token). This is the principled input for a Type-3
code-clone join (documents = functions, tokens = canonicalised lines) and the shape most
callers actually have. A token appearing in every document gets idf = 0 (contributes
nothing), as expected.
Sourcepub fn from_rows(rows: &[Vec<(u32, f64)>]) -> Corpus
pub fn from_rows(rows: &[Vec<(u32, f64)>]) -> Corpus
Build a corpus from raw (dim, weight) rows. Duplicate dims within a row are summed; each row
is L2-normalised; dims are relabelled to a global rank by decreasing max weight. Weights are
expected non-negative (the prefix-filter bound requires it — IDF weights satisfy this).
Examples found in repository?
74fn main() {
75 let n: usize = arg(1, 100_000);
76 let nnz: usize = arg(2, 14);
77 let ndims: usize = arg(3, 20_000);
78 let t: f64 = arg(4, 0.7);
79 let reps: usize = arg(5, 3);
80
81 let rows = gen(n, nnz, ndims, 0x1234_5678_9abc_def1);
82 let build0 = Instant::now();
83 let corpus = Corpus::from_rows(&rows);
84 let build_ms = build0.elapsed().as_secs_f64() * 1000.0;
85
86 // Strategy diagnostic (profiling builds only): posting touches / candidates / pairs. The
87 // candidates-per-pair ratio decides whether to prune harder or speed the dot up.
88 #[cfg(feature = "profiling")]
89 if std::env::var("STATS").is_ok() {
90 let (ncand, survivors, pairs) = cosine_join_counts(&corpus, t);
91 eprintln!(
92 "STATS n={n} t={t} | candidates={ncand} survivors(cos_full)={survivors} pairs={pairs} \
93 | prune_pass={:.4} cos_full_saved={:.4} survivor_precision={:.3}",
94 survivors as f64 / ncand.max(1) as f64,
95 1.0 - survivors as f64 / ncand.max(1) as f64,
96 pairs as f64 / survivors.max(1) as f64,
97 );
98 }
99
100 let mut ms: Vec<f64> = Vec::with_capacity(reps);
101 let mut npairs = 0usize;
102 for _ in 0..reps {
103 let t0 = Instant::now();
104 let pairs = cosine_join(&corpus, t);
105 ms.push(t0.elapsed().as_secs_f64() * 1000.0);
106 npairs = pairs.len();
107 std::hint::black_box(&pairs);
108 }
109 ms.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
110 eprintln!(
111 "n={n} nnz={nnz} ndims={ndims} t={t} | build={build_ms:.0}ms | join: min={:.1}ms median={:.1}ms | pairs={npairs}",
112 ms[0],
113 ms[reps / 2],
114 );
115}More examples
53fn main() {
54 let path: String = arg(1, "perf-local/pypi-type3.simjoin.bin".to_string());
55 let t: f64 = arg(2, 0.8);
56 let reps: usize = arg(3, 3);
57
58 let mut rows = load(&path);
59 // Optional row subset (SJ_NSUB) to bench the small-corpus regime (e.g. find-dup-defs scale
60 // ~3216 functions) instead of the full 287k bandwidth-bound regime.
61 if let Ok(nsub) = std::env::var("SJ_NSUB") {
62 if let Ok(k) = nsub.parse::<usize>() {
63 if rows.len() > k {
64 rows.truncate(k);
65 }
66 }
67 }
68 let n = rows.len();
69 let nnz: usize = rows.iter().map(Vec::len).sum();
70
71 let b0 = Instant::now();
72 let corpus = Corpus::from_rows(&rows);
73 let build_ms = b0.elapsed().as_secs_f64() * 1000.0;
74
75 #[cfg(feature = "profiling")]
76 if std::env::var("STATS").is_ok() {
77 let (ncand, survivors, pairs) = cosine_join_counts(&corpus, t);
78 eprintln!(
79 "STATS n={n} t={t} | candidates={ncand} survivors(cos_full)={survivors} pairs={pairs} \
80 | prune_pass={:.4} survivor_precision={:.3}",
81 survivors as f64 / ncand.max(1) as f64,
82 pairs as f64 / survivors.max(1) as f64,
83 );
84 }
85
86 let mut ms: Vec<f64> = Vec::with_capacity(reps);
87 let mut npairs = 0usize;
88 for _ in 0..reps {
89 let t0 = Instant::now();
90 let pairs = cosine_join(&corpus, t);
91 ms.push(t0.elapsed().as_secs_f64() * 1000.0);
92 npairs = pairs.len();
93 std::hint::black_box(&pairs);
94 }
95 ms.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
96 eprintln!(
97 "pypi-type3 n={n} nnz_total={nnz} mean_nnz={:.1} t={t} | build={build_ms:.0}ms | \
98 join: min={:.1}ms median={:.1}ms | pairs={npairs}",
99 nnz as f64 / n as f64,
100 ms[0],
101 ms[reps / 2],
102 );
103}Auto Trait Implementations§
impl Freeze for Corpus
impl RefUnwindSafe for Corpus
impl Send for Corpus
impl Sync for Corpus
impl Unpin for Corpus
impl UnsafeUnpin for Corpus
impl UnwindSafe for Corpus
Blanket Implementations§
Source§impl<T> BorrowMut<T> for Twhere
T: ?Sized,
impl<T> BorrowMut<T> for Twhere
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
self into a Left variant of Either<Self, Self>
if into_left is true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read moreSource§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
self into a Left variant of Either<Self, Self>
if into_left(&self) returns true.
Converts self into a Right variant of Either<Self, Self>
otherwise. Read more