Skip to main content

sbom_tools/matching/
adaptive.rs

1//! Adaptive threshold adjustment for fuzzy matching.
2//!
3//! This module provides algorithms to automatically adjust matching thresholds
4//! based on the actual distribution of match scores in the data.
5//!
6//! # Problem
7//!
8//! A fixed threshold (e.g., 0.85) may be:
9//! - Too strict for SBOMs with many renamed packages (missing valid matches)
10//! - Too permissive for SBOMs with many similar-named packages (false positives)
11//!
12//! # Solution
13//!
14//! Adaptive thresholding analyzes the score distribution and finds an optimal
15//! threshold that maximizes true matches while minimizing false positives.
16//!
17//! # Algorithms
18//!
19//! - **Target Match Ratio**: Binary search for threshold yielding ~N% matches
20//! - **Otsu's Method**: Find threshold that maximizes between-class variance
21//! - **Knee/Elbow Detection**: Find threshold at score distribution inflection point
22
23use crate::matching::{ComponentMatcher, FuzzyMatchConfig, FuzzyMatcher};
24use crate::model::{Component, NormalizedSbom};
25
26/// Configuration for adaptive threshold adjustment.
27#[derive(Debug, Clone)]
28pub struct AdaptiveThresholdConfig {
29    /// Minimum allowed threshold (don't go below this)
30    pub min_threshold: f64,
31    /// Maximum allowed threshold (don't go above this)
32    pub max_threshold: f64,
33    /// Number of iterations for binary search
34    pub max_iterations: usize,
35    /// Target match ratio (0.0 - 1.0, where 1.0 means all components match)
36    pub target_match_ratio: Option<f64>,
37    /// Minimum number of samples needed for reliable estimation
38    pub min_samples: usize,
39    /// Precision for binary search convergence
40    pub precision: f64,
41}
42
43impl Default for AdaptiveThresholdConfig {
44    fn default() -> Self {
45        Self {
46            min_threshold: 0.50,
47            max_threshold: 0.99,
48            max_iterations: 20,
49            target_match_ratio: None, // Use Otsu by default
50            min_samples: 10,
51            precision: 0.01,
52        }
53    }
54}
55
56impl AdaptiveThresholdConfig {
57    /// Configure for target match ratio search.
58    pub fn for_target_ratio(ratio: f64) -> Self {
59        Self {
60            target_match_ratio: Some(ratio.clamp(0.0, 1.0)),
61            ..Default::default()
62        }
63    }
64
65    /// Configure for Otsu's method (automatic threshold).
66    pub fn otsu() -> Self {
67        Self {
68            target_match_ratio: None,
69            ..Default::default()
70        }
71    }
72}
73
74/// Result of adaptive threshold computation.
75#[derive(Debug, Clone)]
76pub struct AdaptiveThresholdResult {
77    /// The computed optimal threshold
78    pub threshold: f64,
79    /// Method used to compute the threshold
80    pub method: AdaptiveMethod,
81    /// Number of component pairs sampled
82    pub samples: usize,
83    /// Score statistics
84    pub score_stats: ScoreStats,
85    /// Match ratio at the computed threshold
86    pub match_ratio: f64,
87    /// Confidence in the result (0.0 - 1.0)
88    pub confidence: f64,
89}
90
91/// Method used for adaptive threshold computation.
92#[derive(Debug, Clone, Copy, PartialEq, Eq)]
93pub enum AdaptiveMethod {
94    /// Binary search for target match ratio
95    TargetRatio,
96    /// Otsu's method (maximizes between-class variance)
97    Otsu,
98    /// Fallback to default threshold
99    Default,
100}
101
102/// Statistics about the score distribution.
103#[derive(Debug, Clone)]
104pub struct ScoreStats {
105    /// Minimum score observed
106    pub min: f64,
107    /// Maximum score observed
108    pub max: f64,
109    /// Mean score
110    pub mean: f64,
111    /// Standard deviation
112    pub std_dev: f64,
113    /// Median score
114    pub median: f64,
115    /// Number of exact matches (score = 1.0)
116    pub exact_matches: usize,
117    /// Number of complete non-matches (score = 0.0)
118    pub zero_scores: usize,
119}
120
121impl ScoreStats {
122    /// Compute statistics from a set of scores.
123    pub fn from_scores(scores: &[f64]) -> Self {
124        if scores.is_empty() {
125            return Self {
126                min: 0.0,
127                max: 0.0,
128                mean: 0.0,
129                std_dev: 0.0,
130                median: 0.0,
131                exact_matches: 0,
132                zero_scores: 0,
133            };
134        }
135
136        let mut sorted = scores.to_vec();
137        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
138
139        let min = sorted[0];
140        let max = sorted[sorted.len() - 1];
141        let mean = scores.iter().sum::<f64>() / scores.len() as f64;
142        let median = if sorted.len().is_multiple_of(2) {
143            (sorted[sorted.len() / 2 - 1] + sorted[sorted.len() / 2]) / 2.0
144        } else {
145            sorted[sorted.len() / 2]
146        };
147
148        let variance =
149            scores.iter().map(|&s| (s - mean).powi(2)).sum::<f64>() / scores.len() as f64;
150        let std_dev = variance.sqrt();
151
152        let exact_matches = scores.iter().filter(|&&s| s >= 0.9999).count();
153        let zero_scores = scores.iter().filter(|&&s| s < 0.0001).count();
154
155        Self {
156            min,
157            max,
158            mean,
159            std_dev,
160            median,
161            exact_matches,
162            zero_scores,
163        }
164    }
165}
166
167/// Adaptive threshold adjuster.
168pub struct AdaptiveThreshold {
169    config: AdaptiveThresholdConfig,
170}
171
172impl AdaptiveThreshold {
173    /// Create a new adaptive threshold adjuster with the given config.
174    pub fn new(config: AdaptiveThresholdConfig) -> Self {
175        Self { config }
176    }
177
178    /// Compute the optimal threshold for matching between two SBOMs.
179    pub fn compute_threshold(
180        &self,
181        old_sbom: &NormalizedSbom,
182        new_sbom: &NormalizedSbom,
183        matcher: &FuzzyMatcher,
184    ) -> AdaptiveThresholdResult {
185        // Sample scores between components
186        let scores = self.sample_scores(old_sbom, new_sbom, matcher);
187
188        if scores.len() < self.config.min_samples {
189            // Not enough data, fall back to default
190            return AdaptiveThresholdResult {
191                threshold: matcher.config().threshold,
192                method: AdaptiveMethod::Default,
193                samples: scores.len(),
194                score_stats: ScoreStats::from_scores(&scores),
195                match_ratio: 0.0,
196                confidence: 0.0,
197            };
198        }
199
200        let stats = ScoreStats::from_scores(&scores);
201
202        // Choose method based on config
203        let (threshold, method) = if let Some(target_ratio) = self.config.target_match_ratio {
204            let t = self.binary_search_threshold(&scores, target_ratio);
205            (t, AdaptiveMethod::TargetRatio)
206        } else {
207            let t = self.otsu_threshold(&scores);
208            (t, AdaptiveMethod::Otsu)
209        };
210
211        // Clamp to configured bounds
212        let threshold = threshold.clamp(self.config.min_threshold, self.config.max_threshold);
213
214        // Compute match ratio at this threshold
215        let match_count = scores.iter().filter(|&&s| s >= threshold).count();
216        let match_ratio = match_count as f64 / scores.len() as f64;
217
218        // Estimate confidence based on sample size and score distribution
219        let confidence = self.estimate_confidence(&scores, &stats);
220
221        AdaptiveThresholdResult {
222            threshold,
223            method,
224            samples: scores.len(),
225            score_stats: stats,
226            match_ratio,
227            confidence,
228        }
229    }
230
231    /// Sample match scores between components of two SBOMs.
232    fn sample_scores(
233        &self,
234        old_sbom: &NormalizedSbom,
235        new_sbom: &NormalizedSbom,
236        matcher: &FuzzyMatcher,
237    ) -> Vec<f64> {
238        let mut scores = Vec::new();
239        let max_samples = 1000; // Limit for performance
240
241        let old_components: Vec<&Component> = old_sbom.components.values().collect();
242        let new_components: Vec<&Component> = new_sbom.components.values().collect();
243
244        // For each old component, find best match score in new SBOM
245        for old_comp in old_components.iter().take(max_samples) {
246            let best_score = new_components
247                .iter()
248                .map(|new_comp| matcher.match_score(old_comp, new_comp))
249                .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
250                .unwrap_or(0.0);
251
252            scores.push(best_score);
253        }
254
255        scores
256    }
257
258    /// Binary search for threshold that yields target match ratio.
259    fn binary_search_threshold(&self, scores: &[f64], target_ratio: f64) -> f64 {
260        let mut low = self.config.min_threshold;
261        let mut high = self.config.max_threshold;
262
263        for _ in 0..self.config.max_iterations {
264            let mid = (low + high) / 2.0;
265            let match_count = scores.iter().filter(|&&s| s >= mid).count();
266            let ratio = match_count as f64 / scores.len() as f64;
267
268            if (ratio - target_ratio).abs() < self.config.precision {
269                return mid;
270            }
271
272            if ratio > target_ratio {
273                // Too many matches, increase threshold
274                low = mid;
275            } else {
276                // Too few matches, decrease threshold
277                high = mid;
278            }
279        }
280
281        (low + high) / 2.0
282    }
283
284    /// Otsu's method: find threshold that maximizes between-class variance.
285    fn otsu_threshold(&self, scores: &[f64]) -> f64 {
286        // Discretize scores into bins
287        let num_bins = 100;
288        let mut histogram = vec![0usize; num_bins];
289
290        for &score in scores {
291            let bin = ((score * (num_bins - 1) as f64) as usize).min(num_bins - 1);
292            histogram[bin] += 1;
293        }
294
295        let total = scores.len() as f64;
296        let mut sum_total = 0.0;
297        for (i, &count) in histogram.iter().enumerate() {
298            sum_total += i as f64 * count as f64;
299        }
300
301        let mut first_optimal_bin = 0;
302        let mut last_optimal_bin = 0;
303        let mut best_variance = 0.0;
304        let mut sum_background = 0.0;
305        let mut weight_background = 0.0;
306        let variance_tolerance = 1e-6; // Allow for floating-point comparison
307
308        for (i, &count) in histogram.iter().enumerate() {
309            weight_background += count as f64;
310            if weight_background == 0.0 {
311                continue;
312            }
313
314            let weight_foreground = total - weight_background;
315            if weight_foreground == 0.0 {
316                break;
317            }
318
319            sum_background += i as f64 * count as f64;
320            let mean_background = sum_background / weight_background;
321            let mean_foreground = (sum_total - sum_background) / weight_foreground;
322
323            let between_variance =
324                weight_background * weight_foreground * (mean_background - mean_foreground).powi(2);
325
326            if between_variance > best_variance + variance_tolerance {
327                // Found a new best - reset the range
328                best_variance = between_variance;
329                first_optimal_bin = i;
330                last_optimal_bin = i;
331            } else if (between_variance - best_variance).abs() <= variance_tolerance {
332                // Same variance (within tolerance) - extend the range
333                last_optimal_bin = i;
334            }
335        }
336
337        // Use the middle of the optimal range for better practical thresholds
338        // This helps when there's a gap between clusters (bimodal distribution)
339        let middle_bin = (first_optimal_bin + last_optimal_bin) / 2;
340        (middle_bin as f64 + 0.5) / num_bins as f64
341    }
342
343    /// Estimate confidence in the computed threshold.
344    fn estimate_confidence(&self, scores: &[f64], stats: &ScoreStats) -> f64 {
345        // Factors that increase confidence:
346        // 1. More samples
347        // 2. Clear bimodal distribution (high std_dev)
348        // 3. Some exact matches (indicates correct matches exist)
349        // 4. Not all zero scores
350
351        let sample_factor = (scores.len() as f64 / 100.0).min(1.0);
352        let distribution_factor = (stats.std_dev * 3.0).min(1.0);
353        let exact_match_factor = if stats.exact_matches > 0 { 0.9 } else { 0.5 };
354        let zero_score_penalty = if stats.zero_scores == scores.len() {
355            0.0
356        } else {
357            1.0
358        };
359
360        (sample_factor * 0.3
361            + distribution_factor * 0.3
362            + exact_match_factor * 0.2
363            + zero_score_penalty * 0.2)
364            .clamp(0.0, 1.0)
365    }
366}
367
368impl Default for AdaptiveThreshold {
369    fn default() -> Self {
370        Self::new(AdaptiveThresholdConfig::default())
371    }
372}
373
374/// Extension trait for FuzzyMatcher to support adaptive thresholding.
375pub trait AdaptiveMatching {
376    /// Create a matcher with an adaptively computed threshold for the given SBOMs.
377    fn with_adaptive_threshold(
378        old_sbom: &NormalizedSbom,
379        new_sbom: &NormalizedSbom,
380        base_config: FuzzyMatchConfig,
381    ) -> (FuzzyMatcher, AdaptiveThresholdResult);
382
383    /// Compute and apply adaptive threshold to an existing matcher.
384    fn adapt_threshold(
385        &self,
386        old_sbom: &NormalizedSbom,
387        new_sbom: &NormalizedSbom,
388    ) -> AdaptiveThresholdResult;
389}
390
391impl AdaptiveMatching for FuzzyMatcher {
392    fn with_adaptive_threshold(
393        old_sbom: &NormalizedSbom,
394        new_sbom: &NormalizedSbom,
395        base_config: FuzzyMatchConfig,
396    ) -> (FuzzyMatcher, AdaptiveThresholdResult) {
397        let base_matcher = FuzzyMatcher::new(base_config.clone());
398        let adjuster = AdaptiveThreshold::default();
399        let result = adjuster.compute_threshold(old_sbom, new_sbom, &base_matcher);
400
401        // Create new matcher with adapted threshold
402        let mut adapted_config = base_config;
403        adapted_config.threshold = result.threshold;
404        let adapted_matcher = FuzzyMatcher::new(adapted_config);
405
406        (adapted_matcher, result)
407    }
408
409    fn adapt_threshold(
410        &self,
411        old_sbom: &NormalizedSbom,
412        new_sbom: &NormalizedSbom,
413    ) -> AdaptiveThresholdResult {
414        let adjuster = AdaptiveThreshold::default();
415        adjuster.compute_threshold(old_sbom, new_sbom, self)
416    }
417}
418
419#[cfg(test)]
420mod tests {
421    use super::*;
422    use crate::model::DocumentMetadata;
423
424    fn make_component(name: &str) -> Component {
425        Component::new(name.to_string(), format!("id-{}", name))
426    }
427
428    fn make_sbom_with_components(names: &[&str]) -> NormalizedSbom {
429        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
430        for name in names {
431            sbom.add_component(make_component(name));
432        }
433        sbom
434    }
435
436    #[test]
437    fn test_score_stats_computation() {
438        let scores = vec![0.0, 0.3, 0.5, 0.7, 1.0];
439        let stats = ScoreStats::from_scores(&scores);
440
441        assert_eq!(stats.min, 0.0);
442        assert_eq!(stats.max, 1.0);
443        assert!((stats.mean - 0.5).abs() < 0.01);
444        assert_eq!(stats.median, 0.5);
445        assert_eq!(stats.exact_matches, 1);
446        assert_eq!(stats.zero_scores, 1);
447    }
448
449    #[test]
450    fn test_score_stats_empty() {
451        let scores: Vec<f64> = vec![];
452        let stats = ScoreStats::from_scores(&scores);
453
454        assert_eq!(stats.min, 0.0);
455        assert_eq!(stats.max, 0.0);
456        assert_eq!(stats.mean, 0.0);
457    }
458
459    #[test]
460    fn test_adaptive_threshold_config() {
461        let config = AdaptiveThresholdConfig::for_target_ratio(0.7);
462        assert_eq!(config.target_match_ratio, Some(0.7));
463
464        let config = AdaptiveThresholdConfig::otsu();
465        assert!(config.target_match_ratio.is_none());
466    }
467
468    #[test]
469    fn test_adaptive_threshold_with_similar_sboms() {
470        let old = make_sbom_with_components(&[
471            "lodash", "react", "express", "axios", "moment", "webpack", "babel", "eslint",
472            "prettier", "jest",
473        ]);
474        let new = make_sbom_with_components(&[
475            "lodash", "react", "express", "axios", "dayjs", // moment -> dayjs
476            "webpack", "babel", "eslint", "prettier", "vitest", // jest -> vitest
477        ]);
478
479        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
480        let adjuster = AdaptiveThreshold::new(AdaptiveThresholdConfig::otsu());
481        let result = adjuster.compute_threshold(&old, &new, &matcher);
482
483        assert!(result.threshold >= 0.5 && result.threshold <= 0.99);
484        assert!(result.samples >= 10);
485        assert!(result.confidence > 0.0);
486    }
487
488    #[test]
489    fn test_adaptive_threshold_target_ratio() {
490        let old = make_sbom_with_components(&[
491            "package-a",
492            "package-b",
493            "package-c",
494            "package-d",
495            "package-e",
496            "package-f",
497            "package-g",
498            "package-h",
499            "package-i",
500            "package-j",
501        ]);
502        let new = make_sbom_with_components(&[
503            "package-a",
504            "package-b",
505            "package-c",
506            "different-1",
507            "different-2",
508            "different-3",
509            "different-4",
510            "different-5",
511            "different-6",
512            "different-7",
513        ]);
514
515        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
516        let config = AdaptiveThresholdConfig::for_target_ratio(0.3); // Target 30% matches
517        let adjuster = AdaptiveThreshold::new(config);
518        let result = adjuster.compute_threshold(&old, &new, &matcher);
519
520        assert_eq!(result.method, AdaptiveMethod::TargetRatio);
521    }
522
523    #[test]
524    fn test_adaptive_matching_trait() {
525        let old = make_sbom_with_components(&["lodash", "express", "react", "vue", "angular"]);
526        let new = make_sbom_with_components(&["lodash", "express", "react", "svelte", "solid"]);
527
528        let base_config = FuzzyMatchConfig::balanced();
529        let (adapted_matcher, result) =
530            FuzzyMatcher::with_adaptive_threshold(&old, &new, base_config);
531
532        // The adapted matcher should have the computed threshold
533        assert!((adapted_matcher.config().threshold - result.threshold).abs() < 0.001);
534    }
535
536    #[test]
537    fn test_binary_search_threshold() {
538        let scores: Vec<f64> = (0..100).map(|i| i as f64 / 100.0).collect();
539        let adjuster = AdaptiveThreshold::new(AdaptiveThresholdConfig::for_target_ratio(0.5));
540
541        let threshold = adjuster.binary_search_threshold(&scores, 0.5);
542
543        // Should find threshold around 0.5 where 50% of scores are >= threshold
544        assert!((threshold - 0.5).abs() < 0.1);
545    }
546
547    #[test]
548    fn test_otsu_threshold_bimodal() {
549        // Create bimodal distribution (two peaks) with deterministic values
550        // Using extreme separation to make Otsu clearly identify the gap
551        let mut scores = Vec::new();
552
553        // Low scores: all exactly 0.1
554        for _ in 0..50 {
555            scores.push(0.1);
556        }
557        // High scores: all exactly 0.9
558        for _ in 0..50 {
559            scores.push(0.9);
560        }
561
562        let adjuster = AdaptiveThreshold::default();
563        let threshold = adjuster.otsu_threshold(&scores);
564
565        // Otsu should find threshold between the two peaks (around 0.5)
566        // With perfectly bimodal data at 0.1 and 0.9, the optimal split is 0.5
567        assert!(
568            threshold > 0.2 && threshold < 0.8,
569            "Threshold {} should be between peaks (0.2-0.8)",
570            threshold
571        );
572    }
573}