Skip to main content

sbom_tools/matching/
adaptive.rs

1//! Adaptive threshold adjustment for fuzzy matching.
2//!
3//! This module provides algorithms to automatically adjust matching thresholds
4//! based on the actual distribution of match scores in the data.
5//!
6//! # Problem
7//!
8//! A fixed threshold (e.g., 0.85) may be:
9//! - Too strict for SBOMs with many renamed packages (missing valid matches)
10//! - Too permissive for SBOMs with many similar-named packages (false positives)
11//!
12//! # Solution
13//!
14//! Adaptive thresholding analyzes the score distribution and finds an optimal
15//! threshold that maximizes true matches while minimizing false positives.
16//!
17//! # Algorithms
18//!
19//! - **Target Match Ratio**: Binary search for threshold yielding ~N% matches
20//! - **Otsu's Method**: Find threshold that maximizes between-class variance
21//! - **Knee/Elbow Detection**: Find threshold at score distribution inflection point
22
23use crate::matching::{ComponentMatcher, FuzzyMatchConfig, FuzzyMatcher};
24use crate::model::{Component, NormalizedSbom};
25
26/// Configuration for adaptive threshold adjustment.
27#[derive(Debug, Clone)]
28pub struct AdaptiveThresholdConfig {
29    /// Minimum allowed threshold (don't go below this)
30    pub min_threshold: f64,
31    /// Maximum allowed threshold (don't go above this)
32    pub max_threshold: f64,
33    /// Number of iterations for binary search
34    pub max_iterations: usize,
35    /// Target match ratio (0.0 - 1.0, where 1.0 means all components match)
36    pub target_match_ratio: Option<f64>,
37    /// Minimum number of samples needed for reliable estimation
38    pub min_samples: usize,
39    /// Precision for binary search convergence
40    pub precision: f64,
41}
42
43impl Default for AdaptiveThresholdConfig {
44    fn default() -> Self {
45        Self {
46            min_threshold: 0.50,
47            max_threshold: 0.99,
48            max_iterations: 20,
49            target_match_ratio: None, // Use Otsu by default
50            min_samples: 10,
51            precision: 0.01,
52        }
53    }
54}
55
56impl AdaptiveThresholdConfig {
57    /// Configure for target match ratio search.
58    #[must_use]
59    pub fn for_target_ratio(ratio: f64) -> Self {
60        Self {
61            target_match_ratio: Some(ratio.clamp(0.0, 1.0)),
62            ..Default::default()
63        }
64    }
65
66    /// Configure for Otsu's method (automatic threshold).
67    #[must_use]
68    pub fn otsu() -> Self {
69        Self {
70            target_match_ratio: None,
71            ..Default::default()
72        }
73    }
74}
75
76/// Result of adaptive threshold computation.
77#[derive(Debug, Clone)]
78pub struct AdaptiveThresholdResult {
79    /// The computed optimal threshold
80    pub threshold: f64,
81    /// Method used to compute the threshold
82    pub method: AdaptiveMethod,
83    /// Number of component pairs sampled
84    pub samples: usize,
85    /// Score statistics
86    pub score_stats: ScoreStats,
87    /// Match ratio at the computed threshold
88    pub match_ratio: f64,
89    /// Confidence in the result (0.0 - 1.0)
90    pub confidence: f64,
91}
92
93/// Method used for adaptive threshold computation.
94#[derive(Debug, Clone, Copy, PartialEq, Eq)]
95pub enum AdaptiveMethod {
96    /// Binary search for target match ratio
97    TargetRatio,
98    /// Otsu's method (maximizes between-class variance)
99    Otsu,
100    /// Fallback to default threshold
101    Default,
102}
103
104/// Statistics about the score distribution.
105#[derive(Debug, Clone)]
106pub struct ScoreStats {
107    /// Minimum score observed
108    pub min: f64,
109    /// Maximum score observed
110    pub max: f64,
111    /// Mean score
112    pub mean: f64,
113    /// Standard deviation
114    pub std_dev: f64,
115    /// Median score
116    pub median: f64,
117    /// Number of exact matches (score = 1.0)
118    pub exact_matches: usize,
119    /// Number of complete non-matches (score = 0.0)
120    pub zero_scores: usize,
121}
122
123impl ScoreStats {
124    /// Compute statistics from a set of scores.
125    #[must_use]
126    pub fn from_scores(scores: &[f64]) -> Self {
127        if scores.is_empty() {
128            return Self {
129                min: 0.0,
130                max: 0.0,
131                mean: 0.0,
132                std_dev: 0.0,
133                median: 0.0,
134                exact_matches: 0,
135                zero_scores: 0,
136            };
137        }
138
139        let mut sorted = scores.to_vec();
140        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
141
142        let min = sorted[0];
143        let max = sorted[sorted.len() - 1];
144        let mean = scores.iter().sum::<f64>() / scores.len() as f64;
145        let median = if sorted.len().is_multiple_of(2) {
146            f64::midpoint(sorted[sorted.len() / 2 - 1], sorted[sorted.len() / 2])
147        } else {
148            sorted[sorted.len() / 2]
149        };
150
151        let variance =
152            scores.iter().map(|&s| (s - mean).powi(2)).sum::<f64>() / scores.len() as f64;
153        let std_dev = variance.sqrt();
154
155        let exact_matches = scores.iter().filter(|&&s| s >= 0.9999).count();
156        let zero_scores = scores.iter().filter(|&&s| s < 0.0001).count();
157
158        Self {
159            min,
160            max,
161            mean,
162            std_dev,
163            median,
164            exact_matches,
165            zero_scores,
166        }
167    }
168}
169
170/// Adaptive threshold adjuster.
171pub struct AdaptiveThreshold {
172    config: AdaptiveThresholdConfig,
173}
174
175impl AdaptiveThreshold {
176    /// Create a new adaptive threshold adjuster with the given config.
177    #[must_use]
178    pub const fn new(config: AdaptiveThresholdConfig) -> Self {
179        Self { config }
180    }
181
182    /// Compute the optimal threshold for matching between two SBOMs.
183    #[must_use]
184    pub fn compute_threshold(
185        &self,
186        old_sbom: &NormalizedSbom,
187        new_sbom: &NormalizedSbom,
188        matcher: &FuzzyMatcher,
189    ) -> AdaptiveThresholdResult {
190        // Sample scores between components
191        let scores = self.sample_scores(old_sbom, new_sbom, matcher);
192
193        if scores.len() < self.config.min_samples {
194            // Not enough data, fall back to default
195            return AdaptiveThresholdResult {
196                threshold: matcher.config().threshold,
197                method: AdaptiveMethod::Default,
198                samples: scores.len(),
199                score_stats: ScoreStats::from_scores(&scores),
200                match_ratio: 0.0,
201                confidence: 0.0,
202            };
203        }
204
205        let stats = ScoreStats::from_scores(&scores);
206
207        // Choose method based on config
208        let (threshold, method) = self.config.target_match_ratio.map_or_else(
209            || {
210                let t = self.otsu_threshold(&scores);
211                (t, AdaptiveMethod::Otsu)
212            },
213            |target_ratio| {
214                let t = self.binary_search_threshold(&scores, target_ratio);
215                (t, AdaptiveMethod::TargetRatio)
216            },
217        );
218
219        // Clamp to configured bounds
220        let threshold = threshold.clamp(self.config.min_threshold, self.config.max_threshold);
221
222        // Compute match ratio at this threshold
223        let match_count = scores.iter().filter(|&&s| s >= threshold).count();
224        let match_ratio = match_count as f64 / scores.len() as f64;
225
226        // Estimate confidence based on sample size and score distribution
227        let confidence = self.estimate_confidence(&scores, &stats);
228
229        AdaptiveThresholdResult {
230            threshold,
231            method,
232            samples: scores.len(),
233            score_stats: stats,
234            match_ratio,
235            confidence,
236        }
237    }
238
239    /// Sample match scores between components of two SBOMs.
240    fn sample_scores(
241        &self,
242        old_sbom: &NormalizedSbom,
243        new_sbom: &NormalizedSbom,
244        matcher: &FuzzyMatcher,
245    ) -> Vec<f64> {
246        let mut scores = Vec::new();
247        let max_samples = 1000; // Limit for performance
248
249        let old_components: Vec<&Component> = old_sbom.components.values().collect();
250        let new_components: Vec<&Component> = new_sbom.components.values().collect();
251
252        // For each old component, find best match score in new SBOM
253        for old_comp in old_components.iter().take(max_samples) {
254            let best_score = new_components
255                .iter()
256                .map(|new_comp| matcher.match_score(old_comp, new_comp))
257                .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
258                .unwrap_or(0.0);
259
260            scores.push(best_score);
261        }
262
263        scores
264    }
265
266    /// Binary search for threshold that yields target match ratio.
267    fn binary_search_threshold(&self, scores: &[f64], target_ratio: f64) -> f64 {
268        let mut low = self.config.min_threshold;
269        let mut high = self.config.max_threshold;
270
271        for _ in 0..self.config.max_iterations {
272            let mid = f64::midpoint(low, high);
273            let match_count = scores.iter().filter(|&&s| s >= mid).count();
274            let ratio = match_count as f64 / scores.len() as f64;
275
276            if (ratio - target_ratio).abs() < self.config.precision {
277                return mid;
278            }
279
280            if ratio > target_ratio {
281                // Too many matches, increase threshold
282                low = mid;
283            } else {
284                // Too few matches, decrease threshold
285                high = mid;
286            }
287        }
288
289        f64::midpoint(low, high)
290    }
291
292    /// Otsu's method: find threshold that maximizes between-class variance.
293    fn otsu_threshold(&self, scores: &[f64]) -> f64 {
294        // Discretize scores into bins
295        let num_bins = 100;
296        let mut histogram = vec![0usize; num_bins];
297
298        for &score in scores {
299            let bin = ((score * (num_bins - 1) as f64) as usize).min(num_bins - 1);
300            histogram[bin] += 1;
301        }
302
303        let total = scores.len() as f64;
304        let mut sum_total = 0.0;
305        for (i, &count) in histogram.iter().enumerate() {
306            sum_total += i as f64 * count as f64;
307        }
308
309        let mut first_optimal_bin = 0;
310        let mut last_optimal_bin = 0;
311        let mut best_variance = 0.0;
312        let mut sum_background = 0.0;
313        let mut weight_background = 0.0;
314        let variance_tolerance = 1e-6; // Allow for floating-point comparison
315
316        for (i, &count) in histogram.iter().enumerate() {
317            weight_background += count as f64;
318            if weight_background == 0.0 {
319                continue;
320            }
321
322            let weight_foreground = total - weight_background;
323            if weight_foreground == 0.0 {
324                break;
325            }
326
327            sum_background += i as f64 * count as f64;
328            let mean_background = sum_background / weight_background;
329            let mean_foreground = (sum_total - sum_background) / weight_foreground;
330
331            let between_variance =
332                weight_background * weight_foreground * (mean_background - mean_foreground).powi(2);
333
334            if between_variance > best_variance + variance_tolerance {
335                // Found a new best - reset the range
336                best_variance = between_variance;
337                first_optimal_bin = i;
338                last_optimal_bin = i;
339            } else if (between_variance - best_variance).abs() <= variance_tolerance {
340                // Same variance (within tolerance) - extend the range
341                last_optimal_bin = i;
342            }
343        }
344
345        // Use the middle of the optimal range for better practical thresholds
346        // This helps when there's a gap between clusters (bimodal distribution)
347        let middle_bin = usize::midpoint(first_optimal_bin, last_optimal_bin);
348        (middle_bin as f64 + 0.5) / num_bins as f64
349    }
350
351    /// Estimate confidence in the computed threshold.
352    fn estimate_confidence(&self, scores: &[f64], stats: &ScoreStats) -> f64 {
353        // Factors that increase confidence:
354        // 1. More samples
355        // 2. Clear bimodal distribution (high std_dev)
356        // 3. Some exact matches (indicates correct matches exist)
357        // 4. Not all zero scores
358
359        let sample_factor = (scores.len() as f64 / 100.0).min(1.0);
360        let distribution_factor = (stats.std_dev * 3.0).min(1.0);
361        let exact_match_factor = if stats.exact_matches > 0 { 0.9 } else { 0.5 };
362        let zero_score_penalty = if stats.zero_scores == scores.len() {
363            0.0
364        } else {
365            1.0
366        };
367
368        (sample_factor * 0.3
369            + distribution_factor * 0.3
370            + exact_match_factor * 0.2
371            + zero_score_penalty * 0.2)
372            .clamp(0.0, 1.0)
373    }
374}
375
376impl Default for AdaptiveThreshold {
377    fn default() -> Self {
378        Self::new(AdaptiveThresholdConfig::default())
379    }
380}
381
382/// Extension trait for `FuzzyMatcher` to support adaptive thresholding.
383pub trait AdaptiveMatching {
384    /// Create a matcher with an adaptively computed threshold for the given SBOMs.
385    fn with_adaptive_threshold(
386        old_sbom: &NormalizedSbom,
387        new_sbom: &NormalizedSbom,
388        base_config: FuzzyMatchConfig,
389    ) -> (FuzzyMatcher, AdaptiveThresholdResult);
390
391    /// Compute and apply adaptive threshold to an existing matcher.
392    fn adapt_threshold(
393        &self,
394        old_sbom: &NormalizedSbom,
395        new_sbom: &NormalizedSbom,
396    ) -> AdaptiveThresholdResult;
397}
398
399impl AdaptiveMatching for FuzzyMatcher {
400    fn with_adaptive_threshold(
401        old_sbom: &NormalizedSbom,
402        new_sbom: &NormalizedSbom,
403        base_config: FuzzyMatchConfig,
404    ) -> (FuzzyMatcher, AdaptiveThresholdResult) {
405        let base_matcher = Self::new(base_config.clone());
406        let adjuster = AdaptiveThreshold::default();
407        let result = adjuster.compute_threshold(old_sbom, new_sbom, &base_matcher);
408
409        // Create new matcher with adapted threshold
410        let mut adapted_config = base_config;
411        adapted_config.threshold = result.threshold;
412        let adapted_matcher = Self::new(adapted_config);
413
414        (adapted_matcher, result)
415    }
416
417    fn adapt_threshold(
418        &self,
419        old_sbom: &NormalizedSbom,
420        new_sbom: &NormalizedSbom,
421    ) -> AdaptiveThresholdResult {
422        let adjuster = AdaptiveThreshold::default();
423        adjuster.compute_threshold(old_sbom, new_sbom, self)
424    }
425}
426
427#[cfg(test)]
428mod tests {
429    use super::*;
430    use crate::model::DocumentMetadata;
431
432    fn make_component(name: &str) -> Component {
433        Component::new(name.to_string(), format!("id-{}", name))
434    }
435
436    fn make_sbom_with_components(names: &[&str]) -> NormalizedSbom {
437        let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
438        for name in names {
439            sbom.add_component(make_component(name));
440        }
441        sbom
442    }
443
444    #[test]
445    fn test_score_stats_computation() {
446        let scores = vec![0.0, 0.3, 0.5, 0.7, 1.0];
447        let stats = ScoreStats::from_scores(&scores);
448
449        assert_eq!(stats.min, 0.0);
450        assert_eq!(stats.max, 1.0);
451        assert!((stats.mean - 0.5).abs() < 0.01);
452        assert_eq!(stats.median, 0.5);
453        assert_eq!(stats.exact_matches, 1);
454        assert_eq!(stats.zero_scores, 1);
455    }
456
457    #[test]
458    fn test_score_stats_empty() {
459        let scores: Vec<f64> = vec![];
460        let stats = ScoreStats::from_scores(&scores);
461
462        assert_eq!(stats.min, 0.0);
463        assert_eq!(stats.max, 0.0);
464        assert_eq!(stats.mean, 0.0);
465    }
466
467    #[test]
468    fn test_adaptive_threshold_config() {
469        let config = AdaptiveThresholdConfig::for_target_ratio(0.7);
470        assert_eq!(config.target_match_ratio, Some(0.7));
471
472        let config = AdaptiveThresholdConfig::otsu();
473        assert!(config.target_match_ratio.is_none());
474    }
475
476    #[test]
477    fn test_adaptive_threshold_with_similar_sboms() {
478        let old = make_sbom_with_components(&[
479            "lodash", "react", "express", "axios", "moment", "webpack", "babel", "eslint",
480            "prettier", "jest",
481        ]);
482        let new = make_sbom_with_components(&[
483            "lodash", "react", "express", "axios", "dayjs", // moment -> dayjs
484            "webpack", "babel", "eslint", "prettier", "vitest", // jest -> vitest
485        ]);
486
487        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
488        let adjuster = AdaptiveThreshold::new(AdaptiveThresholdConfig::otsu());
489        let result = adjuster.compute_threshold(&old, &new, &matcher);
490
491        assert!(result.threshold >= 0.5 && result.threshold <= 0.99);
492        assert!(result.samples >= 10);
493        assert!(result.confidence > 0.0);
494    }
495
496    #[test]
497    fn test_adaptive_threshold_target_ratio() {
498        let old = make_sbom_with_components(&[
499            "package-a",
500            "package-b",
501            "package-c",
502            "package-d",
503            "package-e",
504            "package-f",
505            "package-g",
506            "package-h",
507            "package-i",
508            "package-j",
509        ]);
510        let new = make_sbom_with_components(&[
511            "package-a",
512            "package-b",
513            "package-c",
514            "different-1",
515            "different-2",
516            "different-3",
517            "different-4",
518            "different-5",
519            "different-6",
520            "different-7",
521        ]);
522
523        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
524        let config = AdaptiveThresholdConfig::for_target_ratio(0.3); // Target 30% matches
525        let adjuster = AdaptiveThreshold::new(config);
526        let result = adjuster.compute_threshold(&old, &new, &matcher);
527
528        assert_eq!(result.method, AdaptiveMethod::TargetRatio);
529    }
530
531    #[test]
532    fn test_adaptive_matching_trait() {
533        let old = make_sbom_with_components(&["lodash", "express", "react", "vue", "angular"]);
534        let new = make_sbom_with_components(&["lodash", "express", "react", "svelte", "solid"]);
535
536        let base_config = FuzzyMatchConfig::balanced();
537        let (adapted_matcher, result) =
538            FuzzyMatcher::with_adaptive_threshold(&old, &new, base_config);
539
540        // The adapted matcher should have the computed threshold
541        assert!((adapted_matcher.config().threshold - result.threshold).abs() < 0.001);
542    }
543
544    #[test]
545    fn test_binary_search_threshold() {
546        let scores: Vec<f64> = (0..100).map(|i| i as f64 / 100.0).collect();
547        let adjuster = AdaptiveThreshold::new(AdaptiveThresholdConfig::for_target_ratio(0.5));
548
549        let threshold = adjuster.binary_search_threshold(&scores, 0.5);
550
551        // Should find threshold around 0.5 where 50% of scores are >= threshold
552        assert!((threshold - 0.5).abs() < 0.1);
553    }
554
555    #[test]
556    fn test_otsu_threshold_bimodal() {
557        // Create bimodal distribution (two peaks) with deterministic values
558        // Using extreme separation to make Otsu clearly identify the gap
559        let mut scores = vec![0.1; 50];
560        // High scores: all exactly 0.9
561        scores.extend(std::iter::repeat_n(0.9, 50));
562
563        let adjuster = AdaptiveThreshold::default();
564        let threshold = adjuster.otsu_threshold(&scores);
565
566        // Otsu should find threshold between the two peaks (around 0.5)
567        // With perfectly bimodal data at 0.1 and 0.9, the optimal split is 0.5
568        assert!(
569            threshold > 0.2 && threshold < 0.8,
570            "Threshold {} should be between peaks (0.2-0.8)",
571            threshold
572        );
573    }
574}