use crate::matching::{ComponentMatcher, FuzzyMatchConfig, FuzzyMatcher};
use crate::model::{Component, NormalizedSbom};
#[derive(Debug, Clone)]
pub struct AdaptiveThresholdConfig {
pub min_threshold: f64,
pub max_threshold: f64,
pub max_iterations: usize,
pub target_match_ratio: Option<f64>,
pub min_samples: usize,
pub precision: f64,
}
impl Default for AdaptiveThresholdConfig {
fn default() -> Self {
Self {
min_threshold: 0.50,
max_threshold: 0.99,
max_iterations: 20,
target_match_ratio: None, min_samples: 10,
precision: 0.01,
}
}
}
impl AdaptiveThresholdConfig {
#[must_use]
pub fn for_target_ratio(ratio: f64) -> Self {
Self {
target_match_ratio: Some(ratio.clamp(0.0, 1.0)),
..Default::default()
}
}
#[must_use]
pub fn otsu() -> Self {
Self {
target_match_ratio: None,
..Default::default()
}
}
}
#[derive(Debug, Clone)]
pub struct AdaptiveThresholdResult {
pub threshold: f64,
pub method: AdaptiveMethod,
pub samples: usize,
pub score_stats: ScoreStats,
pub match_ratio: f64,
pub confidence: f64,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[non_exhaustive]
pub enum AdaptiveMethod {
TargetRatio,
Otsu,
Default,
}
#[derive(Debug, Clone)]
pub struct ScoreStats {
pub min: f64,
pub max: f64,
pub mean: f64,
pub std_dev: f64,
pub median: f64,
pub exact_matches: usize,
pub zero_scores: usize,
}
impl ScoreStats {
#[must_use]
pub fn from_scores(scores: &[f64]) -> Self {
if scores.is_empty() {
return Self {
min: 0.0,
max: 0.0,
mean: 0.0,
std_dev: 0.0,
median: 0.0,
exact_matches: 0,
zero_scores: 0,
};
}
let mut sorted = scores.to_vec();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
let min = sorted[0];
let max = sorted[sorted.len() - 1];
let mean = scores.iter().sum::<f64>() / scores.len() as f64;
let median = if sorted.len().is_multiple_of(2) {
f64::midpoint(sorted[sorted.len() / 2 - 1], sorted[sorted.len() / 2])
} else {
sorted[sorted.len() / 2]
};
let variance =
scores.iter().map(|&s| (s - mean).powi(2)).sum::<f64>() / scores.len() as f64;
let std_dev = variance.sqrt();
let exact_matches = scores.iter().filter(|&&s| s >= 0.9999).count();
let zero_scores = scores.iter().filter(|&&s| s < 0.0001).count();
Self {
min,
max,
mean,
std_dev,
median,
exact_matches,
zero_scores,
}
}
}
pub struct AdaptiveThreshold {
config: AdaptiveThresholdConfig,
}
impl AdaptiveThreshold {
#[must_use]
pub const fn new(config: AdaptiveThresholdConfig) -> Self {
Self { config }
}
#[must_use]
pub fn compute_threshold(
&self,
old_sbom: &NormalizedSbom,
new_sbom: &NormalizedSbom,
matcher: &FuzzyMatcher,
) -> AdaptiveThresholdResult {
let scores = self.sample_scores(old_sbom, new_sbom, matcher);
if scores.len() < self.config.min_samples {
return AdaptiveThresholdResult {
threshold: matcher.config().threshold,
method: AdaptiveMethod::Default,
samples: scores.len(),
score_stats: ScoreStats::from_scores(&scores),
match_ratio: 0.0,
confidence: 0.0,
};
}
let stats = ScoreStats::from_scores(&scores);
let (threshold, method) = self.config.target_match_ratio.map_or_else(
|| {
let t = self.otsu_threshold(&scores);
(t, AdaptiveMethod::Otsu)
},
|target_ratio| {
let t = self.binary_search_threshold(&scores, target_ratio);
(t, AdaptiveMethod::TargetRatio)
},
);
let threshold = threshold.clamp(self.config.min_threshold, self.config.max_threshold);
let match_count = scores.iter().filter(|&&s| s >= threshold).count();
let match_ratio = match_count as f64 / scores.len() as f64;
let confidence = self.estimate_confidence(&scores, &stats);
AdaptiveThresholdResult {
threshold,
method,
samples: scores.len(),
score_stats: stats,
match_ratio,
confidence,
}
}
fn sample_scores(
&self,
old_sbom: &NormalizedSbom,
new_sbom: &NormalizedSbom,
matcher: &FuzzyMatcher,
) -> Vec<f64> {
let mut scores = Vec::new();
let max_samples = 1000;
let old_components: Vec<&Component> = old_sbom.components.values().collect();
let new_components: Vec<&Component> = new_sbom.components.values().collect();
for old_comp in old_components.iter().take(max_samples) {
let best_score = new_components
.iter()
.map(|new_comp| matcher.match_score(old_comp, new_comp))
.max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
.unwrap_or(0.0);
scores.push(best_score);
}
scores
}
fn binary_search_threshold(&self, scores: &[f64], target_ratio: f64) -> f64 {
let mut low = self.config.min_threshold;
let mut high = self.config.max_threshold;
for _ in 0..self.config.max_iterations {
let mid = f64::midpoint(low, high);
let match_count = scores.iter().filter(|&&s| s >= mid).count();
let ratio = match_count as f64 / scores.len() as f64;
if (ratio - target_ratio).abs() < self.config.precision {
return mid;
}
if ratio > target_ratio {
low = mid;
} else {
high = mid;
}
}
f64::midpoint(low, high)
}
fn otsu_threshold(&self, scores: &[f64]) -> f64 {
let num_bins = 100;
let mut histogram = vec![0usize; num_bins];
for &score in scores {
let bin = ((score * (num_bins - 1) as f64) as usize).min(num_bins - 1);
histogram[bin] += 1;
}
let total = scores.len() as f64;
let mut sum_total = 0.0;
for (i, &count) in histogram.iter().enumerate() {
sum_total += i as f64 * count as f64;
}
let mut first_optimal_bin = 0;
let mut last_optimal_bin = 0;
let mut best_variance = 0.0;
let mut sum_background = 0.0;
let mut weight_background = 0.0;
let variance_tolerance = 1e-6;
for (i, &count) in histogram.iter().enumerate() {
weight_background += count as f64;
if weight_background == 0.0 {
continue;
}
let weight_foreground = total - weight_background;
if weight_foreground == 0.0 {
break;
}
sum_background += i as f64 * count as f64;
let mean_background = sum_background / weight_background;
let mean_foreground = (sum_total - sum_background) / weight_foreground;
let between_variance =
weight_background * weight_foreground * (mean_background - mean_foreground).powi(2);
if between_variance > best_variance + variance_tolerance {
best_variance = between_variance;
first_optimal_bin = i;
last_optimal_bin = i;
} else if (between_variance - best_variance).abs() <= variance_tolerance {
last_optimal_bin = i;
}
}
let middle_bin = usize::midpoint(first_optimal_bin, last_optimal_bin);
(middle_bin as f64 + 0.5) / num_bins as f64
}
fn estimate_confidence(&self, scores: &[f64], stats: &ScoreStats) -> f64 {
let sample_factor = (scores.len() as f64 / 100.0).min(1.0);
let distribution_factor = (stats.std_dev * 3.0).min(1.0);
let exact_match_factor = if stats.exact_matches > 0 { 0.9 } else { 0.5 };
let zero_score_penalty = if stats.zero_scores == scores.len() {
0.0
} else {
1.0
};
(sample_factor * 0.3
+ distribution_factor * 0.3
+ exact_match_factor * 0.2
+ zero_score_penalty * 0.2)
.clamp(0.0, 1.0)
}
}
impl Default for AdaptiveThreshold {
fn default() -> Self {
Self::new(AdaptiveThresholdConfig::default())
}
}
pub trait AdaptiveMatching {
fn with_adaptive_threshold(
old_sbom: &NormalizedSbom,
new_sbom: &NormalizedSbom,
base_config: FuzzyMatchConfig,
) -> (FuzzyMatcher, AdaptiveThresholdResult);
fn adapt_threshold(
&self,
old_sbom: &NormalizedSbom,
new_sbom: &NormalizedSbom,
) -> AdaptiveThresholdResult;
}
impl AdaptiveMatching for FuzzyMatcher {
fn with_adaptive_threshold(
old_sbom: &NormalizedSbom,
new_sbom: &NormalizedSbom,
base_config: FuzzyMatchConfig,
) -> (FuzzyMatcher, AdaptiveThresholdResult) {
let base_matcher = Self::new(base_config.clone());
let adjuster = AdaptiveThreshold::default();
let result = adjuster.compute_threshold(old_sbom, new_sbom, &base_matcher);
let mut adapted_config = base_config;
adapted_config.threshold = result.threshold;
let adapted_matcher = Self::new(adapted_config);
(adapted_matcher, result)
}
fn adapt_threshold(
&self,
old_sbom: &NormalizedSbom,
new_sbom: &NormalizedSbom,
) -> AdaptiveThresholdResult {
let adjuster = AdaptiveThreshold::default();
adjuster.compute_threshold(old_sbom, new_sbom, self)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::model::DocumentMetadata;
fn make_component(name: &str) -> Component {
Component::new(name.to_string(), format!("id-{}", name))
}
fn make_sbom_with_components(names: &[&str]) -> NormalizedSbom {
let mut sbom = NormalizedSbom::new(DocumentMetadata::default());
for name in names {
sbom.add_component(make_component(name));
}
sbom
}
#[test]
fn test_score_stats_computation() {
let scores = vec![0.0, 0.3, 0.5, 0.7, 1.0];
let stats = ScoreStats::from_scores(&scores);
assert_eq!(stats.min, 0.0);
assert_eq!(stats.max, 1.0);
assert!((stats.mean - 0.5).abs() < 0.01);
assert_eq!(stats.median, 0.5);
assert_eq!(stats.exact_matches, 1);
assert_eq!(stats.zero_scores, 1);
}
#[test]
fn test_score_stats_empty() {
let scores: Vec<f64> = vec![];
let stats = ScoreStats::from_scores(&scores);
assert_eq!(stats.min, 0.0);
assert_eq!(stats.max, 0.0);
assert_eq!(stats.mean, 0.0);
}
#[test]
fn test_adaptive_threshold_config() {
let config = AdaptiveThresholdConfig::for_target_ratio(0.7);
assert_eq!(config.target_match_ratio, Some(0.7));
let config = AdaptiveThresholdConfig::otsu();
assert!(config.target_match_ratio.is_none());
}
#[test]
fn test_adaptive_threshold_with_similar_sboms() {
let old = make_sbom_with_components(&[
"lodash", "react", "express", "axios", "moment", "webpack", "babel", "eslint",
"prettier", "jest",
]);
let new = make_sbom_with_components(&[
"lodash", "react", "express", "axios", "dayjs", "webpack", "babel", "eslint", "prettier", "vitest", ]);
let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
let adjuster = AdaptiveThreshold::new(AdaptiveThresholdConfig::otsu());
let result = adjuster.compute_threshold(&old, &new, &matcher);
assert!(result.threshold >= 0.5 && result.threshold <= 0.99);
assert!(result.samples >= 10);
assert!(result.confidence > 0.0);
}
#[test]
fn test_adaptive_threshold_target_ratio() {
let old = make_sbom_with_components(&[
"package-a",
"package-b",
"package-c",
"package-d",
"package-e",
"package-f",
"package-g",
"package-h",
"package-i",
"package-j",
]);
let new = make_sbom_with_components(&[
"package-a",
"package-b",
"package-c",
"different-1",
"different-2",
"different-3",
"different-4",
"different-5",
"different-6",
"different-7",
]);
let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
let config = AdaptiveThresholdConfig::for_target_ratio(0.3); let adjuster = AdaptiveThreshold::new(config);
let result = adjuster.compute_threshold(&old, &new, &matcher);
assert_eq!(result.method, AdaptiveMethod::TargetRatio);
}
#[test]
fn test_adaptive_matching_trait() {
let old = make_sbom_with_components(&["lodash", "express", "react", "vue", "angular"]);
let new = make_sbom_with_components(&["lodash", "express", "react", "svelte", "solid"]);
let base_config = FuzzyMatchConfig::balanced();
let (adapted_matcher, result) =
FuzzyMatcher::with_adaptive_threshold(&old, &new, base_config);
assert!((adapted_matcher.config().threshold - result.threshold).abs() < 0.001);
}
#[test]
fn test_binary_search_threshold() {
let scores: Vec<f64> = (0..100).map(|i| i as f64 / 100.0).collect();
let adjuster = AdaptiveThreshold::new(AdaptiveThresholdConfig::for_target_ratio(0.5));
let threshold = adjuster.binary_search_threshold(&scores, 0.5);
assert!((threshold - 0.5).abs() < 0.1);
}
#[test]
fn test_otsu_threshold_bimodal() {
let mut scores = vec![0.1; 50];
scores.extend(std::iter::repeat_n(0.9, 50));
let adjuster = AdaptiveThreshold::default();
let threshold = adjuster.otsu_threshold(&scores);
assert!(
threshold > 0.2 && threshold < 0.8,
"Threshold {} should be between peaks (0.2-0.8)",
threshold
);
}
}