Skip to main content

sbom_tools/matching/
adaptive.rs

1//! Adaptive threshold adjustment for fuzzy matching.
2//!
3//! This module provides algorithms to automatically adjust matching thresholds
4//! based on the actual distribution of match scores in the data.
5//!
6//! # Problem
7//!
8//! A fixed threshold (e.g., 0.85) may be:
9//! - Too strict for SBOMs with many renamed packages (missing valid matches)
10//! - Too permissive for SBOMs with many similar-named packages (false positives)
11//!
12//! # Solution
13//!
14//! Adaptive thresholding analyzes the score distribution and finds an optimal
15//! threshold that maximizes true matches while minimizing false positives.
16//!
17//! # Algorithms
18//!
19//! - **Target Match Ratio**: Binary search for threshold yielding ~N% matches
20//! - **Otsu's Method**: Find threshold that maximizes between-class variance
21//! - **Knee/Elbow Detection**: Find threshold at score distribution inflection point
22
23use crate::matching::{ComponentMatcher, FuzzyMatchConfig, FuzzyMatcher};
24use crate::model::{Component, NormalizedSbom};
25
26/// Configuration for adaptive threshold adjustment.
27#[derive(Debug, Clone)]
28pub struct AdaptiveThresholdConfig {
29    /// Minimum allowed threshold (don't go below this)
30    pub min_threshold: f64,
31    /// Maximum allowed threshold (don't go above this)
32    pub max_threshold: f64,
33    /// Number of iterations for binary search
34    pub max_iterations: usize,
35    /// Target match ratio (0.0 - 1.0, where 1.0 means all components match)
36    pub target_match_ratio: Option<f64>,
37    /// Minimum number of samples needed for reliable estimation
38    pub min_samples: usize,
39    /// Precision for binary search convergence
40    pub precision: f64,
41}
42
43impl Default for AdaptiveThresholdConfig {
44    fn default() -> Self {
45        Self {
46            min_threshold: 0.50,
47            max_threshold: 0.99,
48            max_iterations: 20,
49            target_match_ratio: None, // Use Otsu by default
50            min_samples: 10,
51            precision: 0.01,
52        }
53    }
54}
55
56impl AdaptiveThresholdConfig {
57    /// Configure for target match ratio search.
58    #[must_use]
59    pub fn for_target_ratio(ratio: f64) -> Self {
60        Self {
61            target_match_ratio: Some(ratio.clamp(0.0, 1.0)),
62            ..Default::default()
63        }
64    }
65
66    /// Configure for Otsu's method (automatic threshold).
67    #[must_use]
68    pub fn otsu() -> Self {
69        Self {
70            target_match_ratio: None,
71            ..Default::default()
72        }
73    }
74}
75
76/// Result of adaptive threshold computation.
77#[derive(Debug, Clone)]
78pub struct AdaptiveThresholdResult {
79    /// The computed optimal threshold
80    pub threshold: f64,
81    /// Method used to compute the threshold
82    pub method: AdaptiveMethod,
83    /// Number of component pairs sampled
84    pub samples: usize,
85    /// Score statistics
86    pub score_stats: ScoreStats,
87    /// Match ratio at the computed threshold
88    pub match_ratio: f64,
89    /// Confidence in the result (0.0 - 1.0)
90    pub confidence: f64,
91}
92
93/// Method used for adaptive threshold computation.
94#[derive(Debug, Clone, Copy, PartialEq, Eq)]
95#[non_exhaustive]
96pub enum AdaptiveMethod {
97    /// Binary search for target match ratio
98    TargetRatio,
99    /// Otsu's method (maximizes between-class variance)
100    Otsu,
101    /// Fallback to default threshold
102    Default,
103}
104
105/// Statistics about the score distribution.
106#[derive(Debug, Clone)]
107pub struct ScoreStats {
108    /// Minimum score observed
109    pub min: f64,
110    /// Maximum score observed
111    pub max: f64,
112    /// Mean score
113    pub mean: f64,
114    /// Standard deviation
115    pub std_dev: f64,
116    /// Median score
117    pub median: f64,
118    /// Number of exact matches (score = 1.0)
119    pub exact_matches: usize,
120    /// Number of complete non-matches (score = 0.0)
121    pub zero_scores: usize,
122}
123
124impl ScoreStats {
125    /// Compute statistics from a set of scores.
126    #[must_use]
127    pub fn from_scores(scores: &[f64]) -> Self {
128        if scores.is_empty() {
129            return Self {
130                min: 0.0,
131                max: 0.0,
132                mean: 0.0,
133                std_dev: 0.0,
134                median: 0.0,
135                exact_matches: 0,
136                zero_scores: 0,
137            };
138        }
139
140        let mut sorted = scores.to_vec();
141        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
142
143        let min = sorted[0];
144        let max = sorted[sorted.len() - 1];
145        let mean = scores.iter().sum::<f64>() / scores.len() as f64;
146        let median = if sorted.len().is_multiple_of(2) {
147            f64::midpoint(sorted[sorted.len() / 2 - 1], sorted[sorted.len() / 2])
148        } else {
149            sorted[sorted.len() / 2]
150        };
151
152        let variance =
153            scores.iter().map(|&s| (s - mean).powi(2)).sum::<f64>() / scores.len() as f64;
154        let std_dev = variance.sqrt();
155
156        let exact_matches = scores.iter().filter(|&&s| s >= 0.9999).count();
157        let zero_scores = scores.iter().filter(|&&s| s < 0.0001).count();
158
159        Self {
160            min,
161            max,
162            mean,
163            std_dev,
164            median,
165            exact_matches,
166            zero_scores,
167        }
168    }
169}
170
171/// Adaptive threshold adjuster.
172pub struct AdaptiveThreshold {
173    config: AdaptiveThresholdConfig,
174}
175
176impl AdaptiveThreshold {
177    /// Create a new adaptive threshold adjuster with the given config.
178    #[must_use]
179    pub const fn new(config: AdaptiveThresholdConfig) -> Self {
180        Self { config }
181    }
182
183    /// Compute the optimal threshold for matching between two SBOMs.
184    #[must_use]
185    pub fn compute_threshold(
186        &self,
187        old_sbom: &NormalizedSbom,
188        new_sbom: &NormalizedSbom,
189        matcher: &FuzzyMatcher,
190    ) -> AdaptiveThresholdResult {
191        // Sample scores between components
192        let scores = self.sample_scores(old_sbom, new_sbom, matcher);
193
194        if scores.len() < self.config.min_samples {
195            // Not enough data, fall back to default
196            return AdaptiveThresholdResult {
197                threshold: matcher.config().threshold,
198                method: AdaptiveMethod::Default,
199                samples: scores.len(),
200                score_stats: ScoreStats::from_scores(&scores),
201                match_ratio: 0.0,
202                confidence: 0.0,
203            };
204        }
205
206        let stats = ScoreStats::from_scores(&scores);
207
208        // Choose method based on config
209        let (threshold, method) = self.config.target_match_ratio.map_or_else(
210            || {
211                let t = self.otsu_threshold(&scores);
212                (t, AdaptiveMethod::Otsu)
213            },
214            |target_ratio| {
215                let t = self.binary_search_threshold(&scores, target_ratio);
216                (t, AdaptiveMethod::TargetRatio)
217            },
218        );
219
220        // Clamp to configured bounds
221        let threshold = threshold.clamp(self.config.min_threshold, self.config.max_threshold);
222
223        // Compute match ratio at this threshold
224        let match_count = scores.iter().filter(|&&s| s >= threshold).count();
225        let match_ratio = match_count as f64 / scores.len() as f64;
226
227        // Estimate confidence based on sample size and score distribution
228        let confidence = self.estimate_confidence(&scores, &stats);
229
230        AdaptiveThresholdResult {
231            threshold,
232            method,
233            samples: scores.len(),
234            score_stats: stats,
235            match_ratio,
236            confidence,
237        }
238    }
239
240    /// Sample match scores between components of two SBOMs.
241    fn sample_scores(
242        &self,
243        old_sbom: &NormalizedSbom,
244        new_sbom: &NormalizedSbom,
245        matcher: &FuzzyMatcher,
246    ) -> Vec<f64> {
247        let mut scores = Vec::new();
248        let max_samples = 1000; // Limit for performance
249
250        let old_components: Vec<&Component> = old_sbom.components.values().collect();
251        let new_components: Vec<&Component> = new_sbom.components.values().collect();
252
253        // For each old component, find best match score in new SBOM
254        for old_comp in old_components.iter().take(max_samples) {
255            let best_score = new_components
256                .iter()
257                .map(|new_comp| matcher.match_score(old_comp, new_comp))
258                .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
259                .unwrap_or(0.0);
260
261            scores.push(best_score);
262        }
263
264        scores
265    }
266
267    /// Binary search for threshold that yields target match ratio.
268    fn binary_search_threshold(&self, scores: &[f64], target_ratio: f64) -> f64 {
269        let mut low = self.config.min_threshold;
270        let mut high = self.config.max_threshold;
271
272        for _ in 0..self.config.max_iterations {
273            let mid = f64::midpoint(low, high);
274            let match_count = scores.iter().filter(|&&s| s >= mid).count();
275            let ratio = match_count as f64 / scores.len() as f64;
276
277            if (ratio - target_ratio).abs() < self.config.precision {
278                return mid;
279            }
280
281            if ratio > target_ratio {
282                // Too many matches, increase threshold
283                low = mid;
284            } else {
285                // Too few matches, decrease threshold
286                high = mid;
287            }
288        }
289
290        f64::midpoint(low, high)
291    }
292
293    /// Otsu's method: find threshold that maximizes between-class variance.
294    fn otsu_threshold(&self, scores: &[f64]) -> f64 {
295        // Discretize scores into bins
296        let num_bins = 100;
297        let mut histogram = vec![0usize; num_bins];
298
299        for &score in scores {
300            let bin = ((score * (num_bins - 1) as f64) as usize).min(num_bins - 1);
301            histogram[bin] += 1;
302        }
303
304        let total = scores.len() as f64;
305        let mut sum_total = 0.0;
306        for (i, &count) in histogram.iter().enumerate() {
307            sum_total += i as f64 * count as f64;
308        }
309
310        let mut first_optimal_bin = 0;
311        let mut last_optimal_bin = 0;
312        let mut best_variance = 0.0;
313        let mut sum_background = 0.0;
314        let mut weight_background = 0.0;
315        let variance_tolerance = 1e-6; // Allow for floating-point comparison
316
317        for (i, &count) in histogram.iter().enumerate() {
318            weight_background += count as f64;
319            if weight_background == 0.0 {
320                continue;
321            }
322
323            let weight_foreground = total - weight_background;
324            if weight_foreground == 0.0 {
325                break;
326            }
327
328            sum_background += i as f64 * count as f64;
329            let mean_background = sum_background / weight_background;
330            let mean_foreground = (sum_total - sum_background) / weight_foreground;
331
332            let between_variance =
333                weight_background * weight_foreground * (mean_background - mean_foreground).powi(2);
334
335            if between_variance > best_variance + variance_tolerance {
336                // Found a new best - reset the range
337                best_variance = between_variance;
338                first_optimal_bin = i;
339                last_optimal_bin = i;
340            } else if (between_variance - best_variance).abs() <= variance_tolerance {
341                // Same variance (within tolerance) - extend the range
342                last_optimal_bin = i;
343            }
344        }
345
346        // Use the middle of the optimal range for better practical thresholds
347        // This helps when there's a gap between clusters (bimodal distribution)
348        let middle_bin = usize::midpoint(first_optimal_bin, last_optimal_bin);
349        (middle_bin as f64 + 0.5) / num_bins as f64
350    }
351
352    /// Estimate confidence in the computed threshold.
353    fn estimate_confidence(&self, scores: &[f64], stats: &ScoreStats) -> f64 {
354        // Factors that increase confidence:
355        // 1. More samples
356        // 2. Clear bimodal distribution (high std_dev)
357        // 3. Some exact matches (indicates correct matches exist)
358        // 4. Not all zero scores
359
360        let sample_factor = (scores.len() as f64 / 100.0).min(1.0);
361        let distribution_factor = (stats.std_dev * 3.0).min(1.0);
362        let exact_match_factor = if stats.exact_matches > 0 { 0.9 } else { 0.5 };
363        let zero_score_penalty = if stats.zero_scores == scores.len() {
364            0.0
365        } else {
366            1.0
367        };
368
369        (sample_factor * 0.3
370            + distribution_factor * 0.3
371            + exact_match_factor * 0.2
372            + zero_score_penalty * 0.2)
373            .clamp(0.0, 1.0)
374    }
375}
376
377impl Default for AdaptiveThreshold {
378    fn default() -> Self {
379        Self::new(AdaptiveThresholdConfig::default())
380    }
381}
382
383/// Extension trait for `FuzzyMatcher` to support adaptive thresholding.
384pub trait AdaptiveMatching {
385    /// Create a matcher with an adaptively computed threshold for the given SBOMs.
386    fn with_adaptive_threshold(
387        old_sbom: &NormalizedSbom,
388        new_sbom: &NormalizedSbom,
389        base_config: FuzzyMatchConfig,
390    ) -> (FuzzyMatcher, AdaptiveThresholdResult);
391
392    /// Compute and apply adaptive threshold to an existing matcher.
393    fn adapt_threshold(
394        &self,
395        old_sbom: &NormalizedSbom,
396        new_sbom: &NormalizedSbom,
397    ) -> AdaptiveThresholdResult;
398}
399
400impl AdaptiveMatching for FuzzyMatcher {
401    fn with_adaptive_threshold(
402        old_sbom: &NormalizedSbom,
403        new_sbom: &NormalizedSbom,
404        base_config: FuzzyMatchConfig,
405    ) -> (FuzzyMatcher, AdaptiveThresholdResult) {
406        let base_matcher = Self::new(base_config.clone());
407        let adjuster = AdaptiveThreshold::default();
408        let result = adjuster.compute_threshold(old_sbom, new_sbom, &base_matcher);
409
410        // Create new matcher with adapted threshold
411        let mut adapted_config = base_config;
412        adapted_config.threshold = result.threshold;
413        let adapted_matcher = Self::new(adapted_config);
414
415        (adapted_matcher, result)
416    }
417
418    fn adapt_threshold(
419        &self,
420        old_sbom: &NormalizedSbom,
421        new_sbom: &NormalizedSbom,
422    ) -> AdaptiveThresholdResult {
423        let adjuster = AdaptiveThreshold::default();
424        adjuster.compute_threshold(old_sbom, new_sbom, self)
425    }
426}
427
428#[cfg(test)]
429mod tests {
430    use super::*;
431    use crate::model::DocumentMetadata;
432
433    fn make_component(name: &str) -> Component {
434        Component::new(name.to_string(), format!("id-{}", name))
435    }
436
437    fn make_sbom_with_components(names: &[&str]) -> NormalizedSbom {
438        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
439        for name in names {
440            sbom.add_component(make_component(name));
441        }
442        sbom
443    }
444
445    #[test]
446    fn test_score_stats_computation() {
447        let scores = vec![0.0, 0.3, 0.5, 0.7, 1.0];
448        let stats = ScoreStats::from_scores(&scores);
449
450        assert_eq!(stats.min, 0.0);
451        assert_eq!(stats.max, 1.0);
452        assert!((stats.mean - 0.5).abs() < 0.01);
453        assert_eq!(stats.median, 0.5);
454        assert_eq!(stats.exact_matches, 1);
455        assert_eq!(stats.zero_scores, 1);
456    }
457
458    #[test]
459    fn test_score_stats_empty() {
460        let scores: Vec<f64> = vec![];
461        let stats = ScoreStats::from_scores(&scores);
462
463        assert_eq!(stats.min, 0.0);
464        assert_eq!(stats.max, 0.0);
465        assert_eq!(stats.mean, 0.0);
466    }
467
468    #[test]
469    fn test_adaptive_threshold_config() {
470        let config = AdaptiveThresholdConfig::for_target_ratio(0.7);
471        assert_eq!(config.target_match_ratio, Some(0.7));
472
473        let config = AdaptiveThresholdConfig::otsu();
474        assert!(config.target_match_ratio.is_none());
475    }
476
477    #[test]
478    fn test_adaptive_threshold_with_similar_sboms() {
479        let old = make_sbom_with_components(&[
480            "lodash", "react", "express", "axios", "moment", "webpack", "babel", "eslint",
481            "prettier", "jest",
482        ]);
483        let new = make_sbom_with_components(&[
484            "lodash", "react", "express", "axios", "dayjs", // moment -> dayjs
485            "webpack", "babel", "eslint", "prettier", "vitest", // jest -> vitest
486        ]);
487
488        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
489        let adjuster = AdaptiveThreshold::new(AdaptiveThresholdConfig::otsu());
490        let result = adjuster.compute_threshold(&old, &new, &matcher);
491
492        assert!(result.threshold >= 0.5 && result.threshold <= 0.99);
493        assert!(result.samples >= 10);
494        assert!(result.confidence > 0.0);
495    }
496
497    #[test]
498    fn test_adaptive_threshold_target_ratio() {
499        let old = make_sbom_with_components(&[
500            "package-a",
501            "package-b",
502            "package-c",
503            "package-d",
504            "package-e",
505            "package-f",
506            "package-g",
507            "package-h",
508            "package-i",
509            "package-j",
510        ]);
511        let new = make_sbom_with_components(&[
512            "package-a",
513            "package-b",
514            "package-c",
515            "different-1",
516            "different-2",
517            "different-3",
518            "different-4",
519            "different-5",
520            "different-6",
521            "different-7",
522        ]);
523
524        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
525        let config = AdaptiveThresholdConfig::for_target_ratio(0.3); // Target 30% matches
526        let adjuster = AdaptiveThreshold::new(config);
527        let result = adjuster.compute_threshold(&old, &new, &matcher);
528
529        assert_eq!(result.method, AdaptiveMethod::TargetRatio);
530    }
531
532    #[test]
533    fn test_adaptive_matching_trait() {
534        let old = make_sbom_with_components(&["lodash", "express", "react", "vue", "angular"]);
535        let new = make_sbom_with_components(&["lodash", "express", "react", "svelte", "solid"]);
536
537        let base_config = FuzzyMatchConfig::balanced();
538        let (adapted_matcher, result) =
539            FuzzyMatcher::with_adaptive_threshold(&old, &new, base_config);
540
541        // The adapted matcher should have the computed threshold
542        assert!((adapted_matcher.config().threshold - result.threshold).abs() < 0.001);
543    }
544
545    #[test]
546    fn test_binary_search_threshold() {
547        let scores: Vec<f64> = (0..100).map(|i| i as f64 / 100.0).collect();
548        let adjuster = AdaptiveThreshold::new(AdaptiveThresholdConfig::for_target_ratio(0.5));
549
550        let threshold = adjuster.binary_search_threshold(&scores, 0.5);
551
552        // Should find threshold around 0.5 where 50% of scores are >= threshold
553        assert!((threshold - 0.5).abs() < 0.1);
554    }
555
556    #[test]
557    fn test_otsu_threshold_bimodal() {
558        // Create bimodal distribution (two peaks) with deterministic values
559        // Using extreme separation to make Otsu clearly identify the gap
560        let mut scores = vec![0.1; 50];
561        // High scores: all exactly 0.9
562        scores.extend(std::iter::repeat_n(0.9, 50));
563
564        let adjuster = AdaptiveThreshold::default();
565        let threshold = adjuster.otsu_threshold(&scores);
566
567        // Otsu should find threshold between the two peaks (around 0.5)
568        // With perfectly bimodal data at 0.1 and 0.9, the optimal split is 0.5
569        assert!(
570            threshold > 0.2 && threshold < 0.8,
571            "Threshold {} should be between peaks (0.2-0.8)",
572            threshold
573        );
574    }
575}