Skip to main content

sbom_tools/matching/
config.rs

1//! Fuzzy matching configuration.
2
3use serde::{Deserialize, Serialize};
4
/// Configuration for fuzzy matching behavior.
///
/// Ready-made presets are available via [`FuzzyMatchConfig::strict`],
/// [`FuzzyMatchConfig::balanced`] and [`FuzzyMatchConfig::permissive`], or by
/// name through [`FuzzyMatchConfig::from_preset`]. The [`Default`]
/// implementation returns the balanced preset.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FuzzyMatchConfig {
    /// Minimum confidence threshold (0.0 - 1.0).
    ///
    /// The built-in presets use 0.95 (strict), 0.85 (balanced) and
    /// 0.70 (permissive).
    pub threshold: f64,
    /// Weight for Levenshtein distance component.
    pub levenshtein_weight: f64,
    /// Weight for Jaro-Winkler similarity component.
    ///
    /// In every built-in preset this weight and `levenshtein_weight` add up
    /// to 1.0.
    pub jaro_winkler_weight: f64,
    /// Whether to use alias table lookups.
    pub use_aliases: bool,
    /// Whether to use ecosystem-specific rules.
    pub use_ecosystem_rules: bool,
    /// Maximum candidates to consider for fuzzy matching.
    pub max_candidates: usize,
    /// Multi-field scoring weights (optional, enables multi-field matching when set).
    ///
    /// `None` selects single-field (name-only) matching. Because of
    /// `#[serde(default)]`, a missing key deserializes to `None`.
    #[serde(default)]
    pub field_weights: Option<MultiFieldWeights>,
}
24
/// Weights for multi-field scoring.
///
/// The six field weights (`name` through `group`) should sum to 1.0 for
/// normalized scoring; see [`MultiFieldWeights::is_normalized`] and
/// [`MultiFieldWeights::normalize`].
/// Fields with weight 0.0 are ignored in matching.
///
/// The remaining fields are penalty settings, applied on top of the weighted
/// score to penalize mismatches more strongly. Mind the sign conventions used
/// by the built-in presets: `ecosystem_mismatch_penalty` is stored as a
/// negative value (e.g., -0.15), while the version penalties are stored as
/// positive magnitudes (e.g., 0.10).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MultiFieldWeights {
    /// Weight for name similarity (primary field).
    pub name: f64,
    /// Weight for version match (exact match gives full score).
    pub version: f64,
    /// Weight for ecosystem match (exact match gives full score).
    pub ecosystem: f64,
    /// Weight for license overlap (Jaccard similarity of license sets).
    pub licenses: f64,
    /// Weight for supplier/publisher match.
    pub supplier: f64,
    /// Weight for group/namespace match.
    pub group: f64,

    // Penalty settings (applied on top of the weighted score; excluded from
    // the normalization helpers).

    /// Penalty applied when ecosystems are different (negative value, e.g., -0.15).
    ///
    /// Deserializes to 0.0 (no penalty) when the key is missing.
    #[serde(default)]
    pub ecosystem_mismatch_penalty: f64,
    /// Enable graduated version scoring based on semver distance.
    ///
    /// Deserializes to `true` when the key is missing.
    #[serde(default = "default_true")]
    pub version_divergence_enabled: bool,
    /// Penalty per major version difference (e.g., 0.10 = 10% per major).
    ///
    /// Positive magnitude; deserializes to 0.10 when the key is missing.
    #[serde(default = "default_version_major_penalty")]
    pub version_major_penalty: f64,
    /// Penalty per minor version difference, capped (e.g., 0.02 = 2% per minor).
    ///
    /// Positive magnitude; deserializes to 0.02 when the key is missing.
    #[serde(default = "default_version_minor_penalty")]
    pub version_minor_penalty: f64,
}
62
/// Serde default helper that returns `true`.
///
/// Referenced by `#[serde(default = "default_true")]` on
/// `MultiFieldWeights::version_divergence_enabled`, so that option is switched
/// on when its key is missing from the input.
fn default_true() -> bool {
    true
}
66
/// Serde default for `MultiFieldWeights::version_major_penalty`.
///
/// Returns 0.10, i.e. a 10% penalty per major version of difference.
fn default_version_major_penalty() -> f64 {
    const PENALTY_PER_MAJOR: f64 = 0.10;
    PENALTY_PER_MAJOR
}
70
/// Serde default for `MultiFieldWeights::version_minor_penalty`.
///
/// Returns 0.02, i.e. a 2% penalty per minor version of difference.
fn default_version_minor_penalty() -> f64 {
    const PENALTY_PER_MINOR: f64 = 0.02;
    PENALTY_PER_MINOR
}
74
75impl MultiFieldWeights {
76    /// Default weights emphasizing name matching.
77    pub fn name_focused() -> Self {
78        Self {
79            name: 0.80,
80            version: 0.05,
81            ecosystem: 0.10,
82            licenses: 0.03,
83            supplier: 0.01,
84            group: 0.01,
85            ecosystem_mismatch_penalty: -0.15,
86            version_divergence_enabled: true,
87            version_major_penalty: 0.10,
88            version_minor_penalty: 0.02,
89        }
90    }
91
92    /// Balanced weights across all fields.
93    pub fn balanced() -> Self {
94        Self {
95            name: 0.60,
96            version: 0.10,
97            ecosystem: 0.15,
98            licenses: 0.08,
99            supplier: 0.04,
100            group: 0.03,
101            ecosystem_mismatch_penalty: -0.15, // Applied on top of weighted score
102            version_divergence_enabled: true,
103            version_major_penalty: 0.10,
104            version_minor_penalty: 0.02,
105        }
106    }
107
108    /// Weights for security-focused matching (emphasizes ecosystem and version).
109    pub fn security_focused() -> Self {
110        Self {
111            name: 0.50,
112            version: 0.20,
113            ecosystem: 0.20,
114            licenses: 0.05,
115            supplier: 0.03,
116            group: 0.02,
117            ecosystem_mismatch_penalty: -0.25, // Stricter penalty
118            version_divergence_enabled: true,
119            version_major_penalty: 0.15, // Higher penalty for major version diff
120            version_minor_penalty: 0.03,
121        }
122    }
123
124    /// Legacy weights with no penalties (for backward compatibility).
125    ///
126    /// Use this preset when you want the old binary scoring behavior
127    /// without ecosystem mismatch penalties or version divergence scoring.
128    pub fn legacy() -> Self {
129        Self {
130            name: 0.60,
131            version: 0.10,
132            ecosystem: 0.15,
133            licenses: 0.08,
134            supplier: 0.04,
135            group: 0.03,
136            ecosystem_mismatch_penalty: 0.0, // No penalty
137            version_divergence_enabled: false, // Binary scoring
138            version_major_penalty: 0.0,
139            version_minor_penalty: 0.0,
140        }
141    }
142
143    /// Check if weights are properly normalized (sum to ~1.0).
144    /// Note: Penalty fields are not included in normalization check.
145    pub fn is_normalized(&self) -> bool {
146        let sum =
147            self.name + self.version + self.ecosystem + self.licenses + self.supplier + self.group;
148        (sum - 1.0).abs() < 0.001
149    }
150
151    /// Normalize weights to sum to 1.0.
152    /// Note: Penalty fields are not affected by normalization.
153    pub fn normalize(&mut self) {
154        let sum =
155            self.name + self.version + self.ecosystem + self.licenses + self.supplier + self.group;
156        if sum > 0.0 {
157            self.name /= sum;
158            self.version /= sum;
159            self.ecosystem /= sum;
160            self.licenses /= sum;
161            self.supplier /= sum;
162            self.group /= sum;
163        }
164    }
165}
166
167impl Default for MultiFieldWeights {
168    fn default() -> Self {
169        Self::balanced()
170    }
171}
172
173impl FuzzyMatchConfig {
174    /// Strict matching for security-critical scenarios
175    pub fn strict() -> Self {
176        Self {
177            threshold: 0.95,
178            levenshtein_weight: 0.5,
179            jaro_winkler_weight: 0.5,
180            use_aliases: true,
181            use_ecosystem_rules: true,
182            max_candidates: 100,
183            field_weights: None, // Single-field (name) matching by default
184        }
185    }
186
187    /// Balanced matching for general diff operations
188    pub fn balanced() -> Self {
189        Self {
190            threshold: 0.85,
191            levenshtein_weight: 0.4,
192            jaro_winkler_weight: 0.6,
193            use_aliases: true,
194            use_ecosystem_rules: true,
195            max_candidates: 500,
196            field_weights: None, // Single-field (name) matching by default
197        }
198    }
199
200    /// Permissive matching for discovery/exploration
201    pub fn permissive() -> Self {
202        Self {
203            threshold: 0.70,
204            levenshtein_weight: 0.3,
205            jaro_winkler_weight: 0.7,
206            use_aliases: true,
207            use_ecosystem_rules: true,
208            max_candidates: 1000,
209            field_weights: None, // Single-field (name) matching by default
210        }
211    }
212
213    /// Enable multi-field scoring with the given weights.
214    pub fn with_multi_field(mut self, weights: MultiFieldWeights) -> Self {
215        self.field_weights = Some(weights);
216        self
217    }
218
219    /// Set a custom threshold value.
220    pub fn with_threshold(mut self, threshold: f64) -> Self {
221        self.threshold = threshold;
222        self
223    }
224
225    /// Strict matching with multi-field scoring for security scenarios.
226    pub fn strict_multi_field() -> Self {
227        Self::strict().with_multi_field(MultiFieldWeights::security_focused())
228    }
229
230    /// Balanced matching with multi-field scoring.
231    pub fn balanced_multi_field() -> Self {
232        Self::balanced().with_multi_field(MultiFieldWeights::balanced())
233    }
234
235    /// Create config from a preset name.
236    ///
237    /// Supported presets:
238    /// - "strict", "balanced", "permissive" - single-field (name only)
239    /// - "strict-multi", "balanced-multi" - multi-field scoring enabled
240    pub fn from_preset(name: &str) -> Option<Self> {
241        match name.to_lowercase().as_str() {
242            "strict" => Some(Self::strict()),
243            "balanced" => Some(Self::balanced()),
244            "permissive" => Some(Self::permissive()),
245            "strict-multi" | "strict_multi" => Some(Self::strict_multi_field()),
246            "balanced-multi" | "balanced_multi" => Some(Self::balanced_multi_field()),
247            _ => None,
248        }
249    }
250}
251
252impl Default for FuzzyMatchConfig {
253    fn default() -> Self {
254        Self::balanced()
255    }
256}
257
/// Configuration for cross-ecosystem matching.
///
/// Cross-ecosystem matching allows components to be matched across different
/// package ecosystems (e.g., npm vs PyPI) when they represent the same
/// underlying library. This is enabled by default with conservative settings
/// (minimum score 0.80, penalty 0.10, up to 10 candidates, unverified
/// mappings allowed).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossEcosystemConfig {
    /// Whether cross-ecosystem matching is enabled.
    pub enabled: bool,
    /// Minimum score required for cross-ecosystem matches.
    pub min_score: f64,
    /// Score penalty applied to cross-ecosystem matches.
    ///
    /// The built-in presets store this as a positive magnitude (0.05 - 0.15).
    // NOTE(review): how the matcher applies the penalty is not visible in this
    // file - confirm against the scoring code.
    pub score_penalty: f64,
    /// Maximum number of cross-ecosystem candidates per component.
    pub max_candidates: usize,
    /// Only use verified cross-ecosystem mappings (stricter).
    pub verified_only: bool,
}
276
277impl Default for CrossEcosystemConfig {
278    fn default() -> Self {
279        Self {
280            enabled: true,
281            min_score: 0.80,
282            score_penalty: 0.10,
283            max_candidates: 10,
284            verified_only: false,
285        }
286    }
287}
288
289impl CrossEcosystemConfig {
290    /// Disabled cross-ecosystem matching.
291    pub fn disabled() -> Self {
292        Self {
293            enabled: false,
294            ..Default::default()
295        }
296    }
297
298    /// Strict settings for high-confidence matches only.
299    pub fn strict() -> Self {
300        Self {
301            enabled: true,
302            min_score: 0.90,
303            score_penalty: 0.15,
304            max_candidates: 5,
305            verified_only: true,
306        }
307    }
308
309    /// Permissive settings for discovery/exploration.
310    pub fn permissive() -> Self {
311        Self {
312            enabled: true,
313            min_score: 0.70,
314            score_penalty: 0.05,
315            max_candidates: 20,
316            verified_only: false,
317        }
318    }
319}