sbom-tools 0.1.19

Semantic SBOM diff and analysis tool
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
//! Locality-Sensitive Hashing (LSH) for approximate nearest neighbor search.
//!
//! This module provides `MinHash` LSH for efficient similarity search on large SBOMs
//! (10,000+ components). It trades some accuracy for dramatic speed improvements
//! by using hash-based approximate matching.
//!
//! # How it works
//!
//! 1. Each component name is converted to a set of character shingles (n-grams)
//! 2. `MinHash` signatures are computed for each shingle set
//! 3. Signatures are divided into bands and hashed into buckets
//! 4. Components in the same bucket are candidate matches
//!
//! # Performance
//!
//! - Build time: O(n × k) where k = signature size
//! - Query time: O(1) average for bucket lookup + O(m) for candidates
//! - Space: O(n × k) for signatures

use super::index::ComponentIndex;
use crate::model::{CanonicalId, Component, NormalizedSbom};
use std::collections::{HashMap, HashSet};
use std::hash::{Hash, Hasher};

/// Configuration for LSH index.
#[derive(Debug, Clone)]
pub struct LshConfig {
    /// Number of hash functions in the `MinHash` signature
    pub num_hashes: usize,
    /// Number of bands to divide the signature into (always non-zero for
    /// configs built via `for_threshold`)
    pub num_bands: usize,
    /// Size of character shingles (n-grams)
    pub shingle_size: usize,
    /// Minimum Jaccard similarity threshold this config is tuned for
    pub target_threshold: f64,
    /// Include ecosystem as a token in shingles (improves grouping by ecosystem)
    pub include_ecosystem_token: bool,
    /// Include group/namespace as a token in shingles (useful for Maven, npm scopes)
    pub include_group_token: bool,
}

impl LshConfig {
    /// Create a config tuned for the given similarity threshold.
    ///
    /// The number of bands and rows are chosen to maximize the probability
    /// of finding pairs with similarity >= threshold while minimizing false positives.
    #[must_use]
    pub fn for_threshold(threshold: f64) -> Self {
        // For threshold t, optimal parameters satisfy: t ≈ (1/b)^(1/r)
        // where b = bands, r = rows per band, and b × r = num_hashes.
        //
        // Tiers used below (all yield 100 hashes):
        // - t >= 0.9: b=50, r=2
        // - t >= 0.8: b=25, r=4
        // - t >= 0.7: b=20, r=5
        // - t >= 0.5: b=10, r=10
        // - otherwise: b=5,  r=20 (very permissive)
        let (num_bands, rows_per_band) = if threshold >= 0.9 {
            (50, 2) // catches ~90%+ similar
        } else if threshold >= 0.8 {
            (25, 4) // catches ~80%+ similar
        } else if threshold >= 0.7 {
            (20, 5) // catches ~70%+ similar
        } else if threshold >= 0.5 {
            (10, 10) // catches ~50%+ similar
        } else {
            (5, 20) // very permissive
        };

        Self {
            num_hashes: num_bands * rows_per_band,
            num_bands,
            shingle_size: 3, // Trigrams work well for package names
            target_threshold: threshold,
            include_ecosystem_token: true, // Helps group by ecosystem
            include_group_token: false,    // Optional, disabled by default
        }
    }

    /// Default config for balanced matching (~0.8 threshold).
    #[must_use]
    pub fn default_balanced() -> Self {
        Self::for_threshold(0.8)
    }

    /// Config for strict matching (~0.9 threshold).
    #[must_use]
    pub fn strict() -> Self {
        Self::for_threshold(0.9)
    }

    /// Config for permissive matching (~0.5 threshold).
    #[must_use]
    pub fn permissive() -> Self {
        Self::for_threshold(0.5)
    }

    /// Get rows per band (signature elements per band).
    ///
    /// Integer division; configs from `for_threshold` always satisfy
    /// `num_hashes = num_bands * rows_per_band()` exactly.
    #[must_use]
    pub const fn rows_per_band(&self) -> usize {
        self.num_hashes / self.num_bands
    }
}

impl Default for LshConfig {
    fn default() -> Self {
        Self::default_balanced()
    }
}

/// `MinHash` signature for a component.
#[derive(Debug, Clone)]
pub struct MinHashSignature {
    /// The hash values (one per hash function)
    pub values: Vec<u64>,
}

impl MinHashSignature {
    /// Compute the estimated Jaccard similarity between two signatures.
    ///
    /// Returns the fraction of positions where the two signatures agree,
    /// which estimates the Jaccard similarity of the underlying shingle sets.
    ///
    /// Signatures of different lengths are incomparable and yield `0.0`.
    /// Two empty signatures yield `1.0` (they are trivially identical); this
    /// guard avoids a 0/0 division that would otherwise produce `NaN`.
    #[must_use]
    pub fn estimated_similarity(&self, other: &Self) -> f64 {
        if self.values.len() != other.values.len() {
            return 0.0;
        }
        if self.values.is_empty() {
            // Both empty: identical by definition; avoid 0/0 = NaN.
            return 1.0;
        }

        let matching = self
            .values
            .iter()
            .zip(other.values.iter())
            .filter(|(a, b)| a == b)
            .count();

        matching as f64 / self.values.len() as f64
    }
}

/// LSH index for efficient approximate nearest neighbor search.
///
/// `MinHash` signatures are bucketed band-by-band; components that share a
/// bucket in any band become candidate matches.
pub struct LshIndex {
    /// Configuration
    config: LshConfig,
    /// `MinHash` signatures for each component
    signatures: HashMap<CanonicalId, MinHashSignature>,
    /// Band buckets: `band_index` -> `bucket_hash` -> component IDs
    buckets: Vec<HashMap<u64, Vec<CanonicalId>>>,
    /// Hash coefficients for `MinHash` (a, b pairs for h(x) = (ax + b) mod p).
    /// Seeded from a per-instance `RandomState` in `new`, so coefficients —
    /// and therefore signatures — are only comparable within one index.
    hash_coeffs: Vec<(u64, u64)>,
    /// Large prime for hashing
    prime: u64,
}

impl LshIndex {
    /// Create a new LSH index with the given configuration.
    ///
    /// The `MinHash` coefficients are seeded from a fresh `RandomState`, so
    /// two separately constructed indexes use different hash families and
    /// their signatures are not comparable with each other.
    #[must_use]
    pub fn new(config: LshConfig) -> Self {
        use std::collections::hash_map::RandomState;
        use std::hash::BuildHasher;

        // Generate pseudo-random (a, b) coefficients for h_i(x) = (a*x + b) mod p
        let mut hash_coeffs = Vec::with_capacity(config.num_hashes);
        let random_state = RandomState::new();

        for i in 0..config.num_hashes {
            // `| 1` forces `a` odd, i.e. coprime with 2^64, so wrapping
            // multiplication by `a` is a bijection on u64.
            let a = random_state.hash_one(i as u64 * 31337) | 1;
            let b = random_state.hash_one(i as u64 * 7919 + 12345);
            hash_coeffs.push((a, b));
        }

        // Initialize empty buckets for each band
        let buckets = (0..config.num_bands)
            .map(|_| HashMap::with_capacity(64))
            .collect();

        Self {
            config,
            signatures: HashMap::with_capacity(256),
            buckets,
            hash_coeffs,
            prime: 0xFFFF_FFFF_FFFF_FFC5, // 2^64 - 59, the largest prime below 2^64
        }
    }

    /// Build an LSH index from an SBOM.
    #[must_use]
    pub fn build(sbom: &NormalizedSbom, config: LshConfig) -> Self {
        let mut index = Self::new(config);

        for (id, comp) in &sbom.components {
            index.insert(id.clone(), comp);
        }

        index
    }

    /// Insert a component into the index.
    pub fn insert(&mut self, id: CanonicalId, component: &Component) {
        // Compute shingles from the component (uses ecosystem-aware normalization)
        let shingles = self.compute_shingles(component);

        // Compute MinHash signature
        let signature = self.compute_minhash(&shingles);

        // Insert into band buckets
        self.insert_into_buckets(&id, &signature);

        // Store signature
        self.signatures.insert(id, signature);
    }

    /// Find candidate matches for a component.
    ///
    /// Returns component IDs that are likely similar based on LSH buckets.
    /// These candidates should be verified with exact similarity computation.
    /// If the queried component is itself indexed, it will appear in the result.
    #[must_use]
    pub fn find_candidates(&self, component: &Component) -> Vec<CanonicalId> {
        let shingles = self.compute_shingles(component);
        let signature = self.compute_minhash(&shingles);

        self.find_candidates_by_signature(&signature)
    }

    /// Find candidates using a pre-computed signature.
    ///
    /// The signature must have been produced by *this* index (its hash
    /// family is instance-specific; see `new`).
    #[must_use]
    pub fn find_candidates_by_signature(&self, signature: &MinHashSignature) -> Vec<CanonicalId> {
        let mut candidates = HashSet::new();
        let rows_per_band = self.config.rows_per_band();

        for (band_idx, bucket_map) in self.buckets.iter().enumerate() {
            let band_hash = self.hash_band(signature, band_idx, rows_per_band);

            if let Some(ids) = bucket_map.get(&band_hash) {
                for id in ids {
                    candidates.insert(id.clone());
                }
            }
        }

        candidates.into_iter().collect()
    }

    /// Find candidates for a component already stored in this index.
    ///
    /// Useful for diffing: build index from new SBOM, query with old SBOM components.
    /// Returns an empty vector when `id` is not indexed.
    #[must_use]
    pub fn find_candidates_for_id(&self, id: &CanonicalId) -> Vec<CanonicalId> {
        self.signatures.get(id).map_or_else(Vec::new, |signature| {
            self.find_candidates_by_signature(signature)
        })
    }

    /// Get the `MinHash` signature for a component.
    #[must_use]
    pub fn get_signature(&self, id: &CanonicalId) -> Option<&MinHashSignature> {
        self.signatures.get(id)
    }

    /// Estimate similarity between two components in the index.
    ///
    /// Returns `None` when either id is not indexed.
    #[must_use]
    pub fn estimate_similarity(&self, id_a: &CanonicalId, id_b: &CanonicalId) -> Option<f64> {
        let sig_a = self.signatures.get(id_a)?;
        let sig_b = self.signatures.get(id_b)?;
        Some(sig_a.estimated_similarity(sig_b))
    }

    /// Get statistics about the index.
    #[must_use]
    pub fn stats(&self) -> LshIndexStats {
        let total_components = self.signatures.len();

        // Single pass over every bucket in every band: count buckets and
        // accumulate size statistics at the same time.
        let mut total_buckets = 0usize;
        let mut total_entries = 0usize;
        let mut max_bucket_size = 0usize;
        for bucket in self.buckets.iter().flat_map(|band| band.values()) {
            total_buckets += 1;
            total_entries += bucket.len();
            max_bucket_size = max_bucket_size.max(bucket.len());
        }
        let avg_bucket_size = if total_buckets > 0 {
            total_entries as f64 / total_buckets as f64
        } else {
            0.0
        };

        LshIndexStats {
            total_components,
            num_bands: self.config.num_bands,
            num_hashes: self.config.num_hashes,
            total_buckets,
            max_bucket_size,
            avg_bucket_size,
        }
    }

    /// Compute character shingles (n-grams) from a component.
    ///
    /// Uses ecosystem-aware normalization from `ComponentIndex` for consistent
    /// shingling across `PyPI`, Cargo, npm, etc. Also adds optional ecosystem
    /// and group tokens to improve candidate grouping.
    fn compute_shingles(&self, component: &Component) -> HashSet<u64> {
        // Get ecosystem for normalization
        let ecosystem = component
            .ecosystem
            .as_ref()
            .map(std::string::ToString::to_string);
        let ecosystem_str = ecosystem.as_deref();

        // Use ComponentIndex's normalization for consistency
        let normalized = ComponentIndex::normalize_name(&component.name, ecosystem_str);
        let chars: Vec<char> = normalized.chars().collect();

        // Estimate capacity: roughly (len - shingle_size + 1) shingles + 2 tokens
        let estimated_shingles = chars.len().saturating_sub(self.config.shingle_size) + 3;
        let mut shingles = HashSet::with_capacity(estimated_shingles);

        // Compute name shingles
        if chars.len() < self.config.shingle_size {
            // For very short names, use the whole name as a shingle
            let mut hasher = std::collections::hash_map::DefaultHasher::new();
            normalized.hash(&mut hasher);
            shingles.insert(hasher.finish());
        } else {
            // Hash character windows directly without allocating intermediate strings
            for window in chars.windows(self.config.shingle_size) {
                let mut hasher = std::collections::hash_map::DefaultHasher::new();
                window.hash(&mut hasher);
                shingles.insert(hasher.finish());
            }
        }

        // Add ecosystem token (helps group components by ecosystem)
        if self.config.include_ecosystem_token
            && let Some(ref eco) = ecosystem
        {
            let mut hasher = std::collections::hash_map::DefaultHasher::new();
            "__eco:".hash(&mut hasher);
            eco.to_lowercase().hash(&mut hasher);
            shingles.insert(hasher.finish());
        }

        // Add group/namespace token (useful for Maven group IDs, npm scopes)
        if self.config.include_group_token
            && let Some(ref group) = component.group
        {
            let mut hasher = std::collections::hash_map::DefaultHasher::new();
            "__grp:".hash(&mut hasher);
            group.to_lowercase().hash(&mut hasher);
            shingles.insert(hasher.finish());
        }

        shingles
    }

    /// Compute `MinHash` signature from shingles.
    ///
    /// For an empty shingle set every slot stays at `u64::MAX`.
    fn compute_minhash(&self, shingles: &HashSet<u64>) -> MinHashSignature {
        let mut min_hashes = vec![u64::MAX; self.config.num_hashes];

        for &shingle in shingles {
            for (i, &(a, b)) in self.hash_coeffs.iter().enumerate() {
                // h_i(x) = (a*x + b) mod prime (wrapping mod-2^64 arithmetic
                // folded through the prime; adequate for hashing purposes)
                let hash = a.wrapping_mul(shingle).wrapping_add(b) % self.prime;
                if hash < min_hashes[i] {
                    min_hashes[i] = hash;
                }
            }
        }

        MinHashSignature { values: min_hashes }
    }

    /// Insert a signature into band buckets.
    fn insert_into_buckets(&mut self, id: &CanonicalId, signature: &MinHashSignature) {
        let rows_per_band = self.config.rows_per_band();

        // Pre-compute all band hashes to avoid borrow conflicts
        // (hash_band borrows &self while buckets need &mut)
        let band_hashes: Vec<u64> = (0..self.config.num_bands)
            .map(|band_idx| self.hash_band(signature, band_idx, rows_per_band))
            .collect();

        for (band_idx, bucket_map) in self.buckets.iter_mut().enumerate() {
            bucket_map
                .entry(band_hashes[band_idx])
                .or_default()
                .push(id.clone());
        }
    }

    /// Hash a band (a contiguous slice of the signature) into a bucket key.
    fn hash_band(
        &self,
        signature: &MinHashSignature,
        band_idx: usize,
        rows_per_band: usize,
    ) -> u64 {
        let start = band_idx * rows_per_band;
        let end = (start + rows_per_band).min(signature.values.len());

        let mut hasher = std::collections::hash_map::DefaultHasher::new();
        for &value in &signature.values[start..end] {
            value.hash(&mut hasher);
        }
        hasher.finish()
    }
}

/// Statistics about an LSH index.
#[derive(Debug, Clone)]
pub struct LshIndexStats {
    /// Total number of indexed components
    pub total_components: usize,
    /// Number of bands
    pub num_bands: usize,
    /// Total number of hash functions
    pub num_hashes: usize,
    /// Total number of non-empty buckets
    pub total_buckets: usize,
    /// Maximum components in a single bucket
    pub max_bucket_size: usize,
    /// Average components per bucket
    pub avg_bucket_size: f64,
}

impl std::fmt::Display for LshIndexStats {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Rows per band. All fields are public, so guard against a
        // hand-constructed value with `num_bands == 0` — Display must never
        // panic on division by zero.
        let rows_per_band = self.num_hashes.checked_div(self.num_bands).unwrap_or(0);
        write!(
            f,
            "LSH Index: {} components, {} bands × {} hashes, {} buckets (max: {}, avg: {:.1})",
            self.total_components,
            self.num_bands,
            rows_per_band,
            self.total_buckets,
            self.max_bucket_size,
            self.avg_bucket_size
        )
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::model::DocumentMetadata;

    /// Build a minimal component whose canonical id is derived from its name.
    fn make_component(name: &str) -> Component {
        Component::new(name.to_string(), format!("id-{name}"))
    }

    #[test]
    fn test_lsh_config_for_threshold() {
        let config = LshConfig::for_threshold(0.8);
        assert!(config.num_bands > 0);
        assert_eq!(config.num_hashes, 100);
        assert_eq!(config.num_bands * config.rows_per_band(), config.num_hashes);
    }

    #[test]
    fn test_minhash_signature_similarity() {
        let sig_a = MinHashSignature {
            values: vec![1, 2, 3, 4, 5],
        };
        let sig_b = sig_a.clone();
        assert_eq!(sig_a.estimated_similarity(&sig_b), 1.0);

        // 3 of 5 slots agree -> estimated similarity 0.6.
        let sig_c = MinHashSignature {
            values: vec![1, 2, 3, 6, 7],
        };
        let delta = sig_a.estimated_similarity(&sig_c) - 0.6;
        assert!(delta.abs() < 0.01);
    }

    #[test]
    fn test_lsh_index_build() {
        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
        for name in ["lodash", "lodash-es", "underscore", "react"] {
            sbom.add_component(make_component(name));
        }

        let stats = LshIndex::build(&sbom, LshConfig::default_balanced()).stats();
        assert_eq!(stats.total_components, 4);
        assert!(stats.total_buckets > 0);
    }

    #[test]
    fn test_lsh_finds_similar_names() {
        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
        for name in ["lodash", "lodash-es", "lodash-fp", "react", "angular"] {
            sbom.add_component(make_component(name));
        }

        let index = LshIndex::build(&sbom, LshConfig::for_threshold(0.5));

        // LSH is probabilistic, so assert only the likely outcome: querying
        // with an indexed name should surface at least one candidate.
        let candidates = index.find_candidates(&make_component("lodash"));
        assert!(
            !candidates.is_empty(),
            "Should find at least some candidates"
        );
    }

    #[test]
    fn test_lsh_signature_estimation() {
        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());

        // Index three components, keeping their canonical ids for lookup.
        let ids: Vec<_> = ["lodash", "lodash-es", "completely-different-name"]
            .iter()
            .map(|name| {
                let comp = make_component(name);
                let id = comp.canonical_id.clone();
                sbom.add_component(comp);
                id
            })
            .collect();

        let index = LshIndex::build(&sbom, LshConfig::default_balanced());

        let sim_12 = index.estimate_similarity(&ids[0], &ids[1]).unwrap();
        let sim_13 = index.estimate_similarity(&ids[0], &ids[2]).unwrap();

        assert!(
            sim_12 > sim_13,
            "lodash vs lodash-es ({sim_12:.2}) should be more similar than lodash vs completely-different ({sim_13:.2})"
        );
    }

    #[test]
    fn test_lsh_index_stats() {
        let stats = LshIndex::new(LshConfig::for_threshold(0.8)).stats();
        assert_eq!(stats.total_components, 0);
        assert_eq!(stats.num_bands, 25);
        assert_eq!(stats.num_hashes, 100);
    }
}