Skip to main content

sbom_tools/matching/
config.rs

//! Fuzzy matching configuration.
2
3use serde::{Deserialize, Serialize};
4
5/// Configuration for fuzzy matching behavior.
6#[derive(Debug, Clone, Serialize, Deserialize)]
7pub struct FuzzyMatchConfig {
8    /// Minimum confidence threshold (0.0 - 1.0)
9    pub threshold: f64,
10    /// Weight for Levenshtein distance component
11    pub levenshtein_weight: f64,
12    /// Weight for Jaro-Winkler similarity component
13    pub jaro_winkler_weight: f64,
14    /// Whether to use alias table lookups
15    pub use_aliases: bool,
16    /// Whether to use ecosystem-specific rules
17    pub use_ecosystem_rules: bool,
18    /// Maximum candidates to consider for fuzzy matching
19    pub max_candidates: usize,
20    /// Multi-field scoring weights (optional, enables multi-field matching when set)
21    #[serde(default)]
22    pub field_weights: Option<MultiFieldWeights>,
23}
24
25/// Weights for multi-field scoring.
26///
27/// All weights should sum to 1.0 for normalized scoring.
28/// Fields with weight 0.0 are ignored in matching.
29///
30/// Penalty fields (negative values) are applied on top of the weighted score
31/// to penalize mismatches more strongly.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct MultiFieldWeights {
34    /// Weight for name similarity (primary field)
35    pub name: f64,
36    /// Weight for version match (exact match gives full score)
37    pub version: f64,
38    /// Weight for ecosystem match (exact match gives full score)
39    pub ecosystem: f64,
40    /// Weight for license overlap (Jaccard similarity of license sets)
41    pub licenses: f64,
42    /// Weight for supplier/publisher match
43    pub supplier: f64,
44    /// Weight for group/namespace match
45    pub group: f64,
46
47    // Penalty fields (applied on top of weighted score)
48    /// Penalty applied when ecosystems are different (negative value, e.g., -0.15)
49    #[serde(default)]
50    pub ecosystem_mismatch_penalty: f64,
51    /// Enable graduated version scoring based on semver distance
52    #[serde(default = "default_true")]
53    pub version_divergence_enabled: bool,
54    /// Penalty per major version difference (e.g., 0.10 = 10% per major)
55    #[serde(default = "default_version_major_penalty")]
56    pub version_major_penalty: f64,
57    /// Penalty per minor version difference, capped (e.g., 0.02 = 2% per minor)
58    #[serde(default = "default_version_minor_penalty")]
59    pub version_minor_penalty: f64,
60}
61
/// Serde fallback that always yields `true`.
///
/// Named by `#[serde(default = "default_true")]` so that
/// `version_divergence_enabled` is switched on when the key is absent.
const fn default_true() -> bool {
    true
}
65
/// Serde fallback for `version_major_penalty`: 0.10 per major version of difference.
const fn default_version_major_penalty() -> f64 {
    const PER_MAJOR: f64 = 0.10;
    PER_MAJOR
}
69
/// Serde fallback for `version_minor_penalty`: 0.02 per minor version of difference.
const fn default_version_minor_penalty() -> f64 {
    const PER_MINOR: f64 = 0.02;
    PER_MINOR
}
73
74impl MultiFieldWeights {
75    /// Default weights emphasizing name matching.
76    #[must_use]
77    pub const fn name_focused() -> Self {
78        Self {
79            name: 0.80,
80            version: 0.05,
81            ecosystem: 0.10,
82            licenses: 0.03,
83            supplier: 0.01,
84            group: 0.01,
85            ecosystem_mismatch_penalty: -0.15,
86            version_divergence_enabled: true,
87            version_major_penalty: 0.10,
88            version_minor_penalty: 0.02,
89        }
90    }
91
92    /// Balanced weights across all fields.
93    #[must_use]
94    pub const fn balanced() -> Self {
95        Self {
96            name: 0.60,
97            version: 0.10,
98            ecosystem: 0.15,
99            licenses: 0.08,
100            supplier: 0.04,
101            group: 0.03,
102            ecosystem_mismatch_penalty: -0.15, // Applied on top of weighted score
103            version_divergence_enabled: true,
104            version_major_penalty: 0.10,
105            version_minor_penalty: 0.02,
106        }
107    }
108
109    /// Weights for security-focused matching (emphasizes ecosystem and version).
110    #[must_use]
111    pub const fn security_focused() -> Self {
112        Self {
113            name: 0.50,
114            version: 0.20,
115            ecosystem: 0.20,
116            licenses: 0.05,
117            supplier: 0.03,
118            group: 0.02,
119            ecosystem_mismatch_penalty: -0.25, // Stricter penalty
120            version_divergence_enabled: true,
121            version_major_penalty: 0.15, // Higher penalty for major version diff
122            version_minor_penalty: 0.03,
123        }
124    }
125
126    /// Legacy weights with no penalties (for backward compatibility).
127    ///
128    /// Use this preset when you want the old binary scoring behavior
129    /// without ecosystem mismatch penalties or version divergence scoring.
130    #[must_use]
131    pub const fn legacy() -> Self {
132        Self {
133            name: 0.60,
134            version: 0.10,
135            ecosystem: 0.15,
136            licenses: 0.08,
137            supplier: 0.04,
138            group: 0.03,
139            ecosystem_mismatch_penalty: 0.0,   // No penalty
140            version_divergence_enabled: false, // Binary scoring
141            version_major_penalty: 0.0,
142            version_minor_penalty: 0.0,
143        }
144    }
145
146    /// Check if weights are properly normalized (sum to ~1.0).
147    /// Note: Penalty fields are not included in normalization check.
148    #[must_use]
149    pub fn is_normalized(&self) -> bool {
150        let sum =
151            self.name + self.version + self.ecosystem + self.licenses + self.supplier + self.group;
152        (sum - 1.0).abs() < 0.001
153    }
154
155    /// Normalize weights to sum to 1.0.
156    /// Note: Penalty fields are not affected by normalization.
157    pub fn normalize(&mut self) {
158        let sum =
159            self.name + self.version + self.ecosystem + self.licenses + self.supplier + self.group;
160        if sum > 0.0 {
161            self.name /= sum;
162            self.version /= sum;
163            self.ecosystem /= sum;
164            self.licenses /= sum;
165            self.supplier /= sum;
166            self.group /= sum;
167        }
168    }
169}
170
171impl Default for MultiFieldWeights {
172    fn default() -> Self {
173        Self::balanced()
174    }
175}
176
177impl FuzzyMatchConfig {
178    /// Strict matching for security-critical scenarios
179    #[must_use]
180    pub const fn strict() -> Self {
181        Self {
182            threshold: 0.95,
183            levenshtein_weight: 0.5,
184            jaro_winkler_weight: 0.5,
185            use_aliases: true,
186            use_ecosystem_rules: true,
187            max_candidates: 100,
188            field_weights: None, // Single-field (name) matching by default
189        }
190    }
191
192    /// Balanced matching for general diff operations
193    #[must_use]
194    pub const fn balanced() -> Self {
195        Self {
196            threshold: 0.85,
197            levenshtein_weight: 0.4,
198            jaro_winkler_weight: 0.6,
199            use_aliases: true,
200            use_ecosystem_rules: true,
201            max_candidates: 500,
202            field_weights: None, // Single-field (name) matching by default
203        }
204    }
205
206    /// Permissive matching for discovery/exploration
207    #[must_use]
208    pub const fn permissive() -> Self {
209        Self {
210            threshold: 0.70,
211            levenshtein_weight: 0.3,
212            jaro_winkler_weight: 0.7,
213            use_aliases: true,
214            use_ecosystem_rules: true,
215            max_candidates: 1000,
216            field_weights: None, // Single-field (name) matching by default
217        }
218    }
219
220    /// Enable multi-field scoring with the given weights.
221    #[must_use]
222    pub const fn with_multi_field(mut self, weights: MultiFieldWeights) -> Self {
223        self.field_weights = Some(weights);
224        self
225    }
226
227    /// Set a custom threshold value.
228    #[must_use]
229    pub const fn with_threshold(mut self, threshold: f64) -> Self {
230        self.threshold = threshold;
231        self
232    }
233
234    /// Strict matching with multi-field scoring for security scenarios.
235    #[must_use]
236    pub const fn strict_multi_field() -> Self {
237        Self::strict().with_multi_field(MultiFieldWeights::security_focused())
238    }
239
240    /// Balanced matching with multi-field scoring.
241    #[must_use]
242    pub const fn balanced_multi_field() -> Self {
243        Self::balanced().with_multi_field(MultiFieldWeights::balanced())
244    }
245
246    /// Create config from a preset name.
247    ///
248    /// Supported presets:
249    /// - "strict", "balanced", "permissive" - single-field (name only)
250    /// - "strict-multi", "balanced-multi" - multi-field scoring enabled
251    #[must_use]
252    pub fn from_preset(name: &str) -> Option<Self> {
253        match name.to_lowercase().as_str() {
254            "strict" => Some(Self::strict()),
255            "balanced" => Some(Self::balanced()),
256            "permissive" => Some(Self::permissive()),
257            "strict-multi" | "strict_multi" => Some(Self::strict_multi_field()),
258            "balanced-multi" | "balanced_multi" => Some(Self::balanced_multi_field()),
259            _ => None,
260        }
261    }
262}
263
264impl Default for FuzzyMatchConfig {
265    fn default() -> Self {
266        Self::balanced()
267    }
268}
269
270/// Configuration for cross-ecosystem matching.
271///
272/// Cross-ecosystem matching allows components to be matched across different
273/// package ecosystems (e.g., npm vs `PyPI`) when they represent the same
274/// underlying library. This is enabled by default with conservative settings.
275#[derive(Debug, Clone, Serialize, Deserialize)]
276pub struct CrossEcosystemConfig {
277    /// Whether cross-ecosystem matching is enabled
278    pub enabled: bool,
279    /// Minimum score required for cross-ecosystem matches
280    pub min_score: f64,
281    /// Score penalty applied to cross-ecosystem matches
282    pub score_penalty: f64,
283    /// Maximum number of cross-ecosystem candidates per component
284    pub max_candidates: usize,
285    /// Only use verified cross-ecosystem mappings (stricter)
286    pub verified_only: bool,
287}
288
289impl Default for CrossEcosystemConfig {
290    fn default() -> Self {
291        Self {
292            enabled: true,
293            min_score: 0.80,
294            score_penalty: 0.10,
295            max_candidates: 10,
296            verified_only: false,
297        }
298    }
299}
300
301impl CrossEcosystemConfig {
302    /// Disabled cross-ecosystem matching.
303    #[must_use]
304    pub fn disabled() -> Self {
305        Self {
306            enabled: false,
307            ..Default::default()
308        }
309    }
310
311    /// Strict settings for high-confidence matches only.
312    #[must_use]
313    pub const fn strict() -> Self {
314        Self {
315            enabled: true,
316            min_score: 0.90,
317            score_penalty: 0.15,
318            max_candidates: 5,
319            verified_only: true,
320        }
321    }
322
323    /// Permissive settings for discovery/exploration.
324    #[must_use]
325    pub const fn permissive() -> Self {
326        Self {
327            enabled: true,
328            min_score: 0.70,
329            score_penalty: 0.05,
330            max_candidates: 20,
331            verified_only: false,
332        }
333    }
334}