Skip to main content

sbom_tools/parsers/
detection.rs

1//! Centralized format detection for SBOM parsers.
2//!
3//! This module provides consistent format detection logic used by both
4//! the standard parser and streaming parser, ensuring aligned confidence
5//! thresholds and detection behavior.
6
7use super::traits::{FormatConfidence, FormatDetection, ParseError, SbomParser};
8use super::{CycloneDxParser, SpdxParser};
9use crate::model::NormalizedSbom;
10use std::io::BufRead;
11
/// Minimum confidence threshold for accepting a format detection.
///
/// This is LOW confidence (0.25) - the parser believes it might be able to handle the content.
/// It is the default threshold installed by [`FormatDetector::new`] and the fixed
/// threshold compared against in [`DetectionResult::can_parse`].
pub const MIN_CONFIDENCE_THRESHOLD: f32 = 0.25;
15
/// Parser type identified during detection.
///
/// Each variant corresponds to one of the two parsers held by [`FormatDetector`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParserKind {
    /// The content should be handled by the `CycloneDX` parser.
    CycloneDx,
    /// The content should be handled by the SPDX parser.
    Spdx,
}
22
23impl ParserKind {
24    /// Get the human-readable name for this parser.
25    #[must_use]
26    pub const fn name(&self) -> &'static str {
27        match self {
28            Self::CycloneDx => "CycloneDX",
29            Self::Spdx => "SPDX",
30        }
31    }
32}
33
/// Result of format detection.
///
/// Values are built through [`DetectionResult::unknown`],
/// [`DetectionResult::cyclonedx`] or [`DetectionResult::spdx`].
#[derive(Debug, Clone)]
pub struct DetectionResult {
    /// The parser that should handle this content, or `None` when no format was detected.
    pub parser: Option<ParserKind>,
    /// Confidence level of the detection.
    pub confidence: FormatConfidence,
    /// Detected format variant (e.g., "JSON", "XML", "tag-value").
    pub variant: Option<String>,
    /// Detected version if available.
    pub version: Option<String>,
    /// Any warnings about the detection; for an unknown result this holds the reason.
    pub warnings: Vec<String>,
}
48
49impl DetectionResult {
50    /// Create a result indicating no format was detected.
51    #[must_use]
52    pub fn unknown(reason: &str) -> Self {
53        Self {
54            parser: None,
55            confidence: FormatConfidence::NONE,
56            variant: None,
57            version: None,
58            warnings: vec![reason.to_string()],
59        }
60    }
61
62    /// Create a result for `CycloneDX` detection.
63    #[must_use]
64    pub fn cyclonedx(detection: FormatDetection) -> Self {
65        Self {
66            parser: Some(ParserKind::CycloneDx),
67            confidence: detection.confidence,
68            variant: detection.variant,
69            version: detection.version,
70            warnings: detection.warnings,
71        }
72    }
73
74    /// Create a result for SPDX detection.
75    #[must_use]
76    pub fn spdx(detection: FormatDetection) -> Self {
77        Self {
78            parser: Some(ParserKind::Spdx),
79            confidence: detection.confidence,
80            variant: detection.variant,
81            version: detection.version,
82            warnings: detection.warnings,
83        }
84    }
85
86    /// Check if the detection is confident enough to parse.
87    #[must_use]
88    pub fn can_parse(&self) -> bool {
89        self.parser.is_some() && self.confidence.value() >= MIN_CONFIDENCE_THRESHOLD
90    }
91}
92
/// Centralized format detector for SBOM content.
///
/// Provides consistent detection logic for both standard and streaming parsers.
pub struct FormatDetector {
    /// Parser used to detect and parse `CycloneDX` content.
    cyclonedx: CycloneDxParser,
    /// Parser used to detect and parse SPDX content.
    spdx: SpdxParser,
    /// Minimum confidence a detection must reach before its parser is selected.
    min_confidence: f32,
}
101
102impl Default for FormatDetector {
103    fn default() -> Self {
104        Self::new()
105    }
106}
107
108impl FormatDetector {
109    /// Create a new format detector with default settings.
110    #[must_use]
111    pub const fn new() -> Self {
112        Self {
113            cyclonedx: CycloneDxParser::new(),
114            spdx: SpdxParser::new(),
115            min_confidence: MIN_CONFIDENCE_THRESHOLD,
116        }
117    }
118
119    /// Create a format detector with a custom confidence threshold.
120    #[must_use]
121    pub const fn with_threshold(min_confidence: f32) -> Self {
122        Self {
123            cyclonedx: CycloneDxParser::new(),
124            spdx: SpdxParser::new(),
125            min_confidence: min_confidence.clamp(0.0, 1.0),
126        }
127    }
128
129    /// Detect format from full content string.
130    ///
131    /// This performs full detection using each parser's `detect()` method.
132    #[must_use]
133    pub fn detect_from_content(&self, content: &str) -> DetectionResult {
134        let cdx_detection = self.cyclonedx.detect(content);
135        let spdx_detection = self.spdx.detect(content);
136
137        self.select_best_parser(cdx_detection, spdx_detection)
138    }
139
140    /// Detect format from peeked bytes (for streaming).
141    ///
142    /// This performs detection using a prefix of the content, suitable for
143    /// streaming parsers that can only peek at the beginning of a file.
144    #[must_use]
145    pub fn detect_from_peek(&self, peek: &[u8]) -> DetectionResult {
146        // Find first non-whitespace byte
147        let first_char = peek.iter().find(|&&b| !b.is_ascii_whitespace());
148
149        match first_char {
150            Some(b'{' | b'<') => {
151                // Convert peek to string for detection
152                let preview = String::from_utf8_lossy(peek);
153
154                // Use actual parser detection methods for consistency
155                let cdx_detection = self.cyclonedx.detect(&preview);
156                let spdx_detection = self.spdx.detect(&preview);
157
158                self.select_best_parser(cdx_detection, spdx_detection)
159            }
160            Some(c) if c.is_ascii_alphabetic() => {
161                // Might be tag-value format (starts with letters like "SPDXVersion:")
162                let preview = String::from_utf8_lossy(peek);
163                let cdx_detection = self.cyclonedx.detect(&preview);
164                let spdx_detection = self.spdx.detect(&preview);
165
166                self.select_best_parser(cdx_detection, spdx_detection)
167            }
168            Some(_) => DetectionResult::unknown("Unrecognized content format"),
169            None => DetectionResult::unknown("Empty content"),
170        }
171    }
172
173    /// Select the best parser based on detection results.
174    ///
175    /// Uses consistent threshold checking and returns an error-like result
176    /// instead of defaulting to a specific parser when ambiguous.
177    fn select_best_parser(
178        &self,
179        cdx_detection: FormatDetection,
180        spdx_detection: FormatDetection,
181    ) -> DetectionResult {
182        let cdx_conf = cdx_detection.confidence.value();
183        let spdx_conf = spdx_detection.confidence.value();
184
185        // Log detection for debugging
186        tracing::debug!(
187            "Format detection: CycloneDX={:.2}, SPDX={:.2}, threshold={:.2}",
188            cdx_conf,
189            spdx_conf,
190            self.min_confidence
191        );
192
193        // Apply consistent threshold and select best parser
194        if cdx_conf >= self.min_confidence && cdx_conf > spdx_conf {
195            DetectionResult::cyclonedx(cdx_detection)
196        } else if spdx_conf >= self.min_confidence {
197            DetectionResult::spdx(spdx_detection)
198        } else {
199            // No default bias - return unknown if neither meets threshold
200            let mut result =
201                DetectionResult::unknown("Could not detect SBOM format with sufficient confidence");
202
203            // Add helpful context about what was detected
204            if cdx_conf > 0.0 {
205                result.warnings.push(format!(
206                    "CycloneDX detection: {:.0}% confidence (threshold: {:.0}%)",
207                    cdx_conf * 100.0,
208                    self.min_confidence * 100.0
209                ));
210            }
211            if spdx_conf > 0.0 {
212                result.warnings.push(format!(
213                    "SPDX detection: {:.0}% confidence (threshold: {:.0}%)",
214                    spdx_conf * 100.0,
215                    self.min_confidence * 100.0
216                ));
217            }
218
219            result
220        }
221    }
222
223    /// Parse content using the detected format.
224    ///
225    /// This combines detection and parsing in a single operation.
226    pub fn parse_str(&self, content: &str) -> Result<NormalizedSbom, ParseError> {
227        let detection = self.detect_from_content(content);
228
229        // Log any warnings
230        for warning in &detection.warnings {
231            tracing::warn!("{}", warning);
232        }
233
234        match detection.parser {
235            Some(ParserKind::CycloneDx) if detection.can_parse() => {
236                self.cyclonedx.parse_str(content)
237            }
238            Some(ParserKind::Spdx) if detection.can_parse() => self.spdx.parse_str(content),
239            _ => Err(ParseError::UnknownFormat(
240                "Could not detect SBOM format. Expected CycloneDX or SPDX.".to_string(),
241            )),
242        }
243    }
244
245    /// Parse from a reader using streaming JSON parsing.
246    ///
247    /// Peeks at the content to detect format, then uses the appropriate
248    /// reader-based parser for memory-efficient parsing.
249    pub fn parse_reader<R: BufRead>(&self, mut reader: R) -> Result<NormalizedSbom, ParseError> {
250        // Peek at the buffer to detect format
251        let peek = reader
252            .fill_buf()
253            .map_err(|e| ParseError::IoError(e.to_string()))?;
254
255        if peek.is_empty() {
256            return Err(ParseError::IoError("Empty content".to_string()));
257        }
258
259        let detection = self.detect_from_peek(peek);
260
261        // Log any warnings
262        for warning in &detection.warnings {
263            tracing::warn!("{}", warning);
264        }
265
266        match detection.parser {
267            Some(ParserKind::CycloneDx) if detection.can_parse() => {
268                // Check if it's XML (needs string-based parsing)
269                let is_xml = detection.variant.as_deref() == Some("XML");
270                if is_xml {
271                    let mut content = String::new();
272                    reader
273                        .read_to_string(&mut content)
274                        .map_err(|e| ParseError::IoError(e.to_string()))?;
275                    self.cyclonedx.parse_str(&content)
276                } else {
277                    self.cyclonedx.parse_json_reader(reader)
278                }
279            }
280            Some(ParserKind::Spdx) if detection.can_parse() => {
281                // Check variant - tag-value and RDF need string-based parsing
282                let needs_string =
283                    matches!(detection.variant.as_deref(), Some("tag-value" | "RDF"));
284                if needs_string {
285                    let mut content = String::new();
286                    reader
287                        .read_to_string(&mut content)
288                        .map_err(|e| ParseError::IoError(e.to_string()))?;
289                    self.spdx.parse_str(&content)
290                } else {
291                    self.spdx.parse_json_reader(reader)
292                }
293            }
294            _ => Err(ParseError::UnknownFormat(
295                "Could not detect SBOM format. Expected CycloneDX or SPDX.".to_string(),
296            )),
297        }
298    }
299
300    /// Get a reference to the `CycloneDX` parser.
301    #[must_use]
302    pub const fn cyclonedx_parser(&self) -> &CycloneDxParser {
303        &self.cyclonedx
304    }
305
306    /// Get a reference to the SPDX parser.
307    #[must_use]
308    pub const fn spdx_parser(&self) -> &SpdxParser {
309        &self.spdx
310    }
311}
312
#[cfg(test)]
mod tests {
    use super::*;

    /// Full-content detection recognises a `CycloneDX` JSON document.
    #[test]
    fn test_detect_cyclonedx_json() {
        let content = r#"{"bomFormat": "CycloneDX", "specVersion": "1.5"}"#;
        let result = FormatDetector::new().detect_from_content(content);

        assert!(result.can_parse());
        assert_eq!(result.parser, Some(ParserKind::CycloneDx));
        assert_eq!(result.variant.as_deref(), Some("JSON"));
    }

    /// Full-content detection recognises an SPDX JSON document.
    #[test]
    fn test_detect_spdx_json() {
        let content = r#"{"spdxVersion": "SPDX-2.3", "SPDXID": "SPDXRef-DOCUMENT"}"#;
        let result = FormatDetector::new().detect_from_content(content);

        assert!(result.can_parse());
        assert_eq!(result.parser, Some(ParserKind::Spdx));
        assert_eq!(result.variant.as_deref(), Some("JSON"));
    }

    /// Peek-based detection recognises `CycloneDX` from raw bytes.
    #[test]
    fn test_detect_from_peek_cyclonedx() {
        let peek = br#"{"bomFormat": "CycloneDX", "specVersion": "1.5", "components": []}"#;
        let result = FormatDetector::new().detect_from_peek(peek);

        assert!(result.can_parse());
        assert_eq!(result.parser, Some(ParserKind::CycloneDx));
    }

    /// JSON without SBOM markers yields no parser.
    #[test]
    fn test_detect_unknown_format() {
        let content = r#"{"some": "random", "json": "content"}"#;
        let result = FormatDetector::new().detect_from_content(content);

        assert_eq!(result.parser, None);
        assert!(!result.can_parse());
    }

    /// Ambiguous JSON must not fall back to `CycloneDX` or any other format.
    #[test]
    fn test_no_default_bias() {
        // Ambiguous JSON that doesn't match either format
        let content = r#"{"data": "test"}"#;
        let result = FormatDetector::new().detect_from_content(content);

        assert_eq!(result.parser, None);
        assert!(!result.can_parse());
    }

    /// A result whose confidence is under 0.5 must not be parseable when the
    /// detector was built with a 0.5 threshold.
    #[test]
    fn test_threshold_enforcement() {
        let strict = FormatDetector::with_threshold(0.5);
        // Content with low confidence might not pass higher threshold
        let content = r#"{"specVersion": "1.5", "components": []}"#;
        let result = strict.detect_from_content(content);

        let below_threshold = result.confidence.value() < 0.5;
        assert!(!below_threshold || !result.can_parse());
    }
}