Skip to main content

sbom_tools/matching/
config.rs

//! Fuzzy matching configuration.
2
3use serde::{Deserialize, Serialize};
4
5/// Configuration for fuzzy matching behavior.
6#[derive(Debug, Clone, Serialize, Deserialize)]
7pub struct FuzzyMatchConfig {
8    /// Minimum confidence threshold (0.0 - 1.0)
9    pub threshold: f64,
10    /// Weight for Levenshtein distance component
11    pub levenshtein_weight: f64,
12    /// Weight for Jaro-Winkler similarity component
13    pub jaro_winkler_weight: f64,
14    /// Whether to use alias table lookups
15    pub use_aliases: bool,
16    /// Whether to use ecosystem-specific rules
17    pub use_ecosystem_rules: bool,
18    /// Maximum candidates to consider for fuzzy matching
19    pub max_candidates: usize,
20    /// Multi-field scoring weights (optional, enables multi-field matching when set)
21    #[serde(default)]
22    pub field_weights: Option<MultiFieldWeights>,
23}
24
25/// Weights for multi-field scoring.
26///
27/// All weights should sum to 1.0 for normalized scoring.
28/// Fields with weight 0.0 are ignored in matching.
29///
30/// Penalty fields (negative values) are applied on top of the weighted score
31/// to penalize mismatches more strongly.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct MultiFieldWeights {
34    /// Weight for name similarity (primary field)
35    pub name: f64,
36    /// Weight for version match (exact match gives full score)
37    pub version: f64,
38    /// Weight for ecosystem match (exact match gives full score)
39    pub ecosystem: f64,
40    /// Weight for license overlap (Jaccard similarity of license sets)
41    pub licenses: f64,
42    /// Weight for supplier/publisher match
43    pub supplier: f64,
44    /// Weight for group/namespace match
45    pub group: f64,
46
47    // Penalty fields (applied on top of weighted score)
48
49    /// Penalty applied when ecosystems are different (negative value, e.g., -0.15)
50    #[serde(default)]
51    pub ecosystem_mismatch_penalty: f64,
52    /// Enable graduated version scoring based on semver distance
53    #[serde(default = "default_true")]
54    pub version_divergence_enabled: bool,
55    /// Penalty per major version difference (e.g., 0.10 = 10% per major)
56    #[serde(default = "default_version_major_penalty")]
57    pub version_major_penalty: f64,
58    /// Penalty per minor version difference, capped (e.g., 0.02 = 2% per minor)
59    #[serde(default = "default_version_minor_penalty")]
60    pub version_minor_penalty: f64,
61}
62
/// Serde fallback that yields `true` for a boolean field missing from the input.
const fn default_true() -> bool {
    true
}
66
/// Serde fallback for the per-major-version penalty when it is missing from the input.
const fn default_version_major_penalty() -> f64 {
    // 10% per major version of difference.
    const PER_MAJOR: f64 = 0.10;
    PER_MAJOR
}
70
/// Serde fallback for the per-minor-version penalty when it is missing from the input.
const fn default_version_minor_penalty() -> f64 {
    // 2% per minor version of difference.
    const PER_MINOR: f64 = 0.02;
    PER_MINOR
}
74
75impl MultiFieldWeights {
76    /// Default weights emphasizing name matching.
77    #[must_use] 
78    pub const fn name_focused() -> Self {
79        Self {
80            name: 0.80,
81            version: 0.05,
82            ecosystem: 0.10,
83            licenses: 0.03,
84            supplier: 0.01,
85            group: 0.01,
86            ecosystem_mismatch_penalty: -0.15,
87            version_divergence_enabled: true,
88            version_major_penalty: 0.10,
89            version_minor_penalty: 0.02,
90        }
91    }
92
93    /// Balanced weights across all fields.
94    #[must_use] 
95    pub const fn balanced() -> Self {
96        Self {
97            name: 0.60,
98            version: 0.10,
99            ecosystem: 0.15,
100            licenses: 0.08,
101            supplier: 0.04,
102            group: 0.03,
103            ecosystem_mismatch_penalty: -0.15, // Applied on top of weighted score
104            version_divergence_enabled: true,
105            version_major_penalty: 0.10,
106            version_minor_penalty: 0.02,
107        }
108    }
109
110    /// Weights for security-focused matching (emphasizes ecosystem and version).
111    #[must_use] 
112    pub const fn security_focused() -> Self {
113        Self {
114            name: 0.50,
115            version: 0.20,
116            ecosystem: 0.20,
117            licenses: 0.05,
118            supplier: 0.03,
119            group: 0.02,
120            ecosystem_mismatch_penalty: -0.25, // Stricter penalty
121            version_divergence_enabled: true,
122            version_major_penalty: 0.15, // Higher penalty for major version diff
123            version_minor_penalty: 0.03,
124        }
125    }
126
127    /// Legacy weights with no penalties (for backward compatibility).
128    ///
129    /// Use this preset when you want the old binary scoring behavior
130    /// without ecosystem mismatch penalties or version divergence scoring.
131    #[must_use] 
132    pub const fn legacy() -> Self {
133        Self {
134            name: 0.60,
135            version: 0.10,
136            ecosystem: 0.15,
137            licenses: 0.08,
138            supplier: 0.04,
139            group: 0.03,
140            ecosystem_mismatch_penalty: 0.0, // No penalty
141            version_divergence_enabled: false, // Binary scoring
142            version_major_penalty: 0.0,
143            version_minor_penalty: 0.0,
144        }
145    }
146
147    /// Check if weights are properly normalized (sum to ~1.0).
148    /// Note: Penalty fields are not included in normalization check.
149    #[must_use] 
150    pub fn is_normalized(&self) -> bool {
151        let sum =
152            self.name + self.version + self.ecosystem + self.licenses + self.supplier + self.group;
153        (sum - 1.0).abs() < 0.001
154    }
155
156    /// Normalize weights to sum to 1.0.
157    /// Note: Penalty fields are not affected by normalization.
158    pub fn normalize(&mut self) {
159        let sum =
160            self.name + self.version + self.ecosystem + self.licenses + self.supplier + self.group;
161        if sum > 0.0 {
162            self.name /= sum;
163            self.version /= sum;
164            self.ecosystem /= sum;
165            self.licenses /= sum;
166            self.supplier /= sum;
167            self.group /= sum;
168        }
169    }
170}
171
172impl Default for MultiFieldWeights {
173    fn default() -> Self {
174        Self::balanced()
175    }
176}
177
178impl FuzzyMatchConfig {
179    /// Strict matching for security-critical scenarios
180    #[must_use] 
181    pub const fn strict() -> Self {
182        Self {
183            threshold: 0.95,
184            levenshtein_weight: 0.5,
185            jaro_winkler_weight: 0.5,
186            use_aliases: true,
187            use_ecosystem_rules: true,
188            max_candidates: 100,
189            field_weights: None, // Single-field (name) matching by default
190        }
191    }
192
193    /// Balanced matching for general diff operations
194    #[must_use] 
195    pub const fn balanced() -> Self {
196        Self {
197            threshold: 0.85,
198            levenshtein_weight: 0.4,
199            jaro_winkler_weight: 0.6,
200            use_aliases: true,
201            use_ecosystem_rules: true,
202            max_candidates: 500,
203            field_weights: None, // Single-field (name) matching by default
204        }
205    }
206
207    /// Permissive matching for discovery/exploration
208    #[must_use] 
209    pub const fn permissive() -> Self {
210        Self {
211            threshold: 0.70,
212            levenshtein_weight: 0.3,
213            jaro_winkler_weight: 0.7,
214            use_aliases: true,
215            use_ecosystem_rules: true,
216            max_candidates: 1000,
217            field_weights: None, // Single-field (name) matching by default
218        }
219    }
220
221    /// Enable multi-field scoring with the given weights.
222    #[must_use]
223    pub const fn with_multi_field(mut self, weights: MultiFieldWeights) -> Self {
224        self.field_weights = Some(weights);
225        self
226    }
227
228    /// Set a custom threshold value.
229    #[must_use]
230    pub const fn with_threshold(mut self, threshold: f64) -> Self {
231        self.threshold = threshold;
232        self
233    }
234
235    /// Strict matching with multi-field scoring for security scenarios.
236    #[must_use] 
237    pub const fn strict_multi_field() -> Self {
238        Self::strict().with_multi_field(MultiFieldWeights::security_focused())
239    }
240
241    /// Balanced matching with multi-field scoring.
242    #[must_use] 
243    pub const fn balanced_multi_field() -> Self {
244        Self::balanced().with_multi_field(MultiFieldWeights::balanced())
245    }
246
247    /// Create config from a preset name.
248    ///
249    /// Supported presets:
250    /// - "strict", "balanced", "permissive" - single-field (name only)
251    /// - "strict-multi", "balanced-multi" - multi-field scoring enabled
252    #[must_use] 
253    pub fn from_preset(name: &str) -> Option<Self> {
254        match name.to_lowercase().as_str() {
255            "strict" => Some(Self::strict()),
256            "balanced" => Some(Self::balanced()),
257            "permissive" => Some(Self::permissive()),
258            "strict-multi" | "strict_multi" => Some(Self::strict_multi_field()),
259            "balanced-multi" | "balanced_multi" => Some(Self::balanced_multi_field()),
260            _ => None,
261        }
262    }
263}
264
265impl Default for FuzzyMatchConfig {
266    fn default() -> Self {
267        Self::balanced()
268    }
269}
270
271/// Configuration for cross-ecosystem matching.
272///
273/// Cross-ecosystem matching allows components to be matched across different
274/// package ecosystems (e.g., npm vs `PyPI`) when they represent the same
275/// underlying library. This is enabled by default with conservative settings.
276#[derive(Debug, Clone, Serialize, Deserialize)]
277pub struct CrossEcosystemConfig {
278    /// Whether cross-ecosystem matching is enabled
279    pub enabled: bool,
280    /// Minimum score required for cross-ecosystem matches
281    pub min_score: f64,
282    /// Score penalty applied to cross-ecosystem matches
283    pub score_penalty: f64,
284    /// Maximum number of cross-ecosystem candidates per component
285    pub max_candidates: usize,
286    /// Only use verified cross-ecosystem mappings (stricter)
287    pub verified_only: bool,
288}
289
290impl Default for CrossEcosystemConfig {
291    fn default() -> Self {
292        Self {
293            enabled: true,
294            min_score: 0.80,
295            score_penalty: 0.10,
296            max_candidates: 10,
297            verified_only: false,
298        }
299    }
300}
301
302impl CrossEcosystemConfig {
303    /// Disabled cross-ecosystem matching.
304    #[must_use] 
305    pub fn disabled() -> Self {
306        Self {
307            enabled: false,
308            ..Default::default()
309        }
310    }
311
312    /// Strict settings for high-confidence matches only.
313    #[must_use] 
314    pub const fn strict() -> Self {
315        Self {
316            enabled: true,
317            min_score: 0.90,
318            score_penalty: 0.15,
319            max_candidates: 5,
320            verified_only: true,
321        }
322    }
323
324    /// Permissive settings for discovery/exploration.
325    #[must_use] 
326    pub const fn permissive() -> Self {
327        Self {
328            enabled: true,
329            min_score: 0.70,
330            score_penalty: 0.05,
331            max_candidates: 20,
332            verified_only: false,
333        }
334    }
335}