Skip to main content

sbom_model/
lib.rs

1#![doc = include_str!("../readme.md")]
2
3use indexmap::IndexMap;
4use packageurl::PackageUrl;
5use serde::{Deserialize, Serialize};
6use sha2::{Digest, Sha256};
7use std::collections::{BTreeMap, BTreeSet};
8use std::str::FromStr;
9
10/// Format-agnostic SBOM (Software Bill of Materials) representation.
11///
12/// This is the central type that holds all components and their relationships.
13/// It abstracts over format-specific details from CycloneDX, SPDX, and other formats.
14///
15/// # Example
16///
17/// ```
18/// use sbom_model::{Sbom, Component};
19///
20/// let mut sbom = Sbom::default();
21/// let component = Component::new("serde".into(), Some("1.0.0".into()));
22/// sbom.components.insert(component.id.clone(), component);
23/// ```
24#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
25pub struct Sbom {
26    /// Document-level metadata (creation time, tools, authors).
27    pub metadata: Metadata,
28    /// All components indexed by their stable identifier.
29    pub components: IndexMap<ComponentId, Component>,
30    /// Dependency graph as adjacency list: parent -> set of children.
31    pub dependencies: BTreeMap<ComponentId, BTreeSet<ComponentId>>,
32}
33
34impl Default for Sbom {
35    fn default() -> Self {
36        Self {
37            metadata: Metadata::default(),
38            components: IndexMap::new(),
39            dependencies: BTreeMap::new(),
40        }
41    }
42}
43
44/// SBOM document metadata.
45///
46/// Contains information about when and how the SBOM was created.
47/// This data is stripped during normalization since it varies between
48/// tool runs and shouldn't affect diff comparisons.
49#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
50pub struct Metadata {
51    /// ISO 8601 timestamp of document creation.
52    pub timestamp: Option<String>,
53    /// Tools used to generate the SBOM (e.g., "syft", "trivy").
54    pub tools: Vec<String>,
55    /// Document authors or organizations.
56    pub authors: Vec<String>,
57}
58
59/// Stable identifier for a component.
60///
61/// Used as a key in the component map and dependency graph. Prefers package URLs
62/// (purls) when available since they provide globally unique identifiers. Falls
63/// back to a deterministic SHA-256 hash of component properties when no purl exists.
64///
65/// # Example
66///
67/// ```
68/// use sbom_model::ComponentId;
69///
70/// // With a purl (preferred)
71/// let id = ComponentId::new(Some("pkg:npm/lodash@4.17.21"), &[]);
72/// assert_eq!(id.as_str(), "pkg:npm/lodash@4.17.21");
73///
74/// // Without a purl (hash fallback)
75/// let id = ComponentId::new(None, &[("name", "foo"), ("version", "1.0")]);
76/// assert!(id.as_str().starts_with("h:"));
77/// ```
78#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord, Serialize, Deserialize)]
79pub struct ComponentId(String);
80
81impl ComponentId {
82    /// Creates a new identifier from a purl or property hash.
83    ///
84    /// If a purl is provided, it will be canonicalized. Otherwise, a deterministic
85    /// SHA-256 hash is computed from the provided key-value properties.
86    pub fn new(purl: Option<&str>, properties: &[(&str, &str)]) -> Self {
87        if let Some(purl) = purl {
88            // Try to canonicalize purl
89            if let Ok(parsed) = PackageUrl::from_str(purl) {
90                return ComponentId(parsed.to_string());
91            }
92            return ComponentId(purl.to_string());
93        }
94
95        // Deterministic hash fallback
96        let mut hasher = Sha256::new();
97        for (k, v) in properties {
98            hasher.update(k.as_bytes());
99            hasher.update(b":");
100            hasher.update(v.as_bytes());
101            hasher.update(b"|");
102        }
103        let hash = hex::encode(hasher.finalize());
104        ComponentId(format!("h:{}", hash))
105    }
106
107    /// Returns the identifier as a string slice.
108    pub fn as_str(&self) -> &str {
109        &self.0
110    }
111}
112
113impl std::fmt::Display for ComponentId {
114    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
115        write!(f, "{}", self.0)
116    }
117}
118
119/// A software component (package, library, or application).
120///
121/// Represents a single entry in the SBOM with all its metadata.
122/// Components are identified by their [`ComponentId`] and can have
123/// relationships to other components via the dependency graph.
124#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
125pub struct Component {
126    /// Stable identifier for this component.
127    pub id: ComponentId,
128    /// Package name (e.g., "serde", "lodash").
129    pub name: String,
130    /// Package version (e.g., "1.0.0", "4.17.21").
131    pub version: Option<String>,
132    /// Package ecosystem (e.g., "cargo", "npm", "pypi").
133    pub ecosystem: Option<String>,
134    /// Package supplier or publisher.
135    pub supplier: Option<String>,
136    /// Human-readable description.
137    pub description: Option<String>,
138    /// Package URL per the [purl spec](https://github.com/package-url/purl-spec).
139    pub purl: Option<String>,
140    /// SPDX license identifiers (e.g., "MIT", "Apache-2.0").
141    pub licenses: BTreeSet<String>,
142    /// Checksums keyed by algorithm (e.g., "sha256" -> "abc123...").
143    pub hashes: BTreeMap<String, String>,
144    /// Original identifiers from the source document (e.g., SPDX SPDXRef, CycloneDX bom-ref).
145    pub source_ids: Vec<String>,
146}
147
148impl Component {
149    /// Creates a new component with the given name and optional version.
150    ///
151    /// The component ID is generated from a hash of the name and version.
152    /// Use this for simple cases; for full control, construct the struct directly.
153    pub fn new(name: String, version: Option<String>) -> Self {
154        let mut props = vec![("name", name.as_str())];
155        if let Some(v) = &version {
156            props.push(("version", v));
157        }
158        let id = ComponentId::new(None, &props);
159
160        Self {
161            id,
162            name,
163            version,
164            ecosystem: None,
165            supplier: None,
166            description: None,
167            purl: None,
168            licenses: BTreeSet::new(),
169            hashes: BTreeMap::new(),
170            source_ids: Vec::new(),
171        }
172    }
173}
174
175impl Sbom {
176    /// Normalizes the SBOM for deterministic comparison.
177    ///
178    /// This method:
179    /// - Sorts components by ID
180    /// - Deduplicates and sorts licenses within each component
181    /// - Lowercases hash algorithms and values
182    /// - Clears volatile metadata (timestamps, tools, authors)
183    ///
184    /// Call this before comparing two SBOMs to ignore irrelevant differences.
185    pub fn normalize(&mut self) {
186        // Sort components by ID for deterministic output
187        self.components.sort_keys();
188
189        // Sort dependencies
190        for deps in self.dependencies.values_mut() {
191            // BTreeSet is already sorted
192            // But we might want to ensure consistency if we change container types later
193            let _ = deps;
194        }
195
196        // Normalize components
197        for component in self.components.values_mut() {
198            component.normalize();
199        }
200
201        // Strip volatile metadata
202        self.metadata.timestamp = None;
203        self.metadata.tools.clear();
204        self.metadata.authors.clear(); // Authors might be relevant, but often change slightly. Let's keep strict for now.
205    }
206
207    /// Returns root components (those not depended on by any other component).
208    ///
209    /// These are typically the top-level packages or applications in the SBOM.
210    pub fn roots(&self) -> Vec<ComponentId> {
211        let targets: BTreeSet<_> = self.dependencies.values().flatten().collect();
212        self.components
213            .keys()
214            .filter(|id| !targets.contains(id))
215            .cloned()
216            .collect()
217    }
218
219    /// Returns direct dependencies of the given component.
220    pub fn deps(&self, id: &ComponentId) -> Vec<ComponentId> {
221        self.dependencies
222            .get(id)
223            .map(|d| d.iter().cloned().collect())
224            .unwrap_or_default()
225    }
226
227    /// Returns reverse dependencies (components that depend on the given component).
228    pub fn rdeps(&self, id: &ComponentId) -> Vec<ComponentId> {
229        self.dependencies
230            .iter()
231            .filter(|(_, children)| children.contains(id))
232            .map(|(parent, _)| parent.clone())
233            .collect()
234    }
235
236    /// Returns all transitive dependencies of the given component.
237    ///
238    /// Traverses the dependency graph depth-first and returns all reachable components.
239    pub fn transitive_deps(&self, id: &ComponentId) -> BTreeSet<ComponentId> {
240        let mut visited = BTreeSet::new();
241        let mut stack = vec![id.clone()];
242        while let Some(current) = stack.pop() {
243            if let Some(children) = self.dependencies.get(&current) {
244                for child in children {
245                    if visited.insert(child.clone()) {
246                        stack.push(child.clone());
247                    }
248                }
249            }
250        }
251        visited
252    }
253
254    /// Returns all unique ecosystems present in the SBOM.
255    pub fn ecosystems(&self) -> BTreeSet<String> {
256        self.components
257            .values()
258            .filter_map(|c| c.ecosystem.clone())
259            .collect()
260    }
261
262    /// Returns all unique licenses present across all components.
263    pub fn licenses(&self) -> BTreeSet<String> {
264        self.components
265            .values()
266            .flat_map(|c| c.licenses.iter().cloned())
267            .collect()
268    }
269
270    /// Returns components that have no checksums/hashes.
271    ///
272    /// Useful for identifying components that may need integrity verification.
273    pub fn missing_hashes(&self) -> Vec<ComponentId> {
274        self.components
275            .iter()
276            .filter(|(_, c)| c.hashes.is_empty())
277            .map(|(id, _)| id.clone())
278            .collect()
279    }
280
281    /// Finds a component by its package URL.
282    pub fn by_purl(&self, purl: &str) -> Option<&Component> {
283        self.components
284            .values()
285            .find(|c| c.purl.as_deref() == Some(purl))
286    }
287}
288
289impl Component {
290    /// Normalizes the component for deterministic comparison.
291    ///
292    /// Lowercases hash keys and values. Licenses are stored as a BTreeSet
293    /// so they're already sorted and deduplicated.
294    pub fn normalize(&mut self) {
295        let normalized_hashes: BTreeMap<String, String> = self
296            .hashes
297            .iter()
298            .map(|(k, v)| (k.to_lowercase(), v.to_lowercase()))
299            .collect();
300        self.hashes = normalized_hashes;
301    }
302}
303
304/// Extracts the ecosystem (package type) from a purl string.
305///
306/// Returns `None` if the purl is invalid or cannot be parsed.
307///
308/// # Example
309///
310/// ```
311/// use sbom_model::ecosystem_from_purl;
312///
313/// assert_eq!(ecosystem_from_purl("pkg:npm/lodash@4.17.21"), Some("npm".to_string()));
314/// assert_eq!(ecosystem_from_purl("pkg:cargo/serde@1.0.0"), Some("cargo".to_string()));
315/// assert_eq!(ecosystem_from_purl("invalid"), None);
316/// ```
317pub fn ecosystem_from_purl(purl: &str) -> Option<String> {
318    PackageUrl::from_str(purl).ok().map(|p| p.ty().to_string())
319}
320
321/// Extracts individual license IDs from an SPDX expression.
322///
323/// Parses the expression and returns all license IDs found.
324/// If parsing fails, returns the original string as a single-element set.
325///
326/// # Example
327///
328/// ```
329/// use sbom_model::parse_license_expression;
330///
331/// let ids = parse_license_expression("MIT OR Apache-2.0");
332/// assert!(ids.contains("MIT"));
333/// assert!(ids.contains("Apache-2.0"));
334/// ```
335pub fn parse_license_expression(license: &str) -> BTreeSet<String> {
336    match spdx::Expression::parse(license) {
337        Ok(expr) => {
338            let ids: BTreeSet<String> = expr
339                .requirements()
340                .filter_map(|r| r.req.license.id())
341                .map(|id| id.name.to_string())
342                .collect();
343            if ids.is_empty() {
344                // Expression parsed but no IDs found, keep original
345                BTreeSet::from([license.to_string()])
346            } else {
347                ids
348            }
349        }
350        Err(_) => {
351            // Not a valid SPDX expression, keep original
352            BTreeSet::from([license.to_string()])
353        }
354    }
355}
356
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a hash-identified component from string slices.
    fn component(name: &str, version: Option<&str>) -> Component {
        Component::new(name.to_string(), version.map(str::to_string))
    }

    /// Stores `component` in `sbom` under its own id and hands the id back.
    fn add(sbom: &mut Sbom, component: Component) -> ComponentId {
        let id = component.id.clone();
        sbom.components.insert(id.clone(), component);
        id
    }

    /// Builds an owned string set from string slices.
    fn set(items: &[&str]) -> BTreeSet<String> {
        items.iter().map(|item| item.to_string()).collect()
    }

    #[test]
    fn test_component_id_purl() {
        let id = ComponentId::new(Some("pkg:npm/left-pad@1.3.0"), &[]);
        assert_eq!(id.as_str(), "pkg:npm/left-pad@1.3.0");
    }

    #[test]
    fn test_component_id_hash_stability() {
        let props = [("name", "foo"), ("version", "1.0")];
        let first = ComponentId::new(None, &props);
        let second = ComponentId::new(None, &props);
        assert_eq!(first, second);
        assert!(first.as_str().starts_with("h:"));
    }

    #[test]
    fn test_normalization() {
        let mut comp = component("test", Some("1.0"));
        for license in ["MIT", "Apache-2.0"] {
            comp.licenses.insert(license.to_string());
        }
        comp.hashes.insert("SHA-256".to_string(), "ABC".to_string());

        comp.normalize();

        // The set keeps licenses sorted and unique on its own.
        assert_eq!(comp.licenses, set(&["Apache-2.0", "MIT"]));
        assert_eq!(comp.hashes.get("sha-256").unwrap(), "abc");
    }

    #[test]
    fn test_parse_license_expression() {
        // OR: both operands are extracted, nothing else.
        assert_eq!(
            parse_license_expression("MIT OR Apache-2.0"),
            set(&["Apache-2.0", "MIT"])
        );

        // A lone license id.
        assert_eq!(parse_license_expression("MIT"), set(&["MIT"]));

        // AND: both operands are extracted.
        let ids = parse_license_expression("MIT AND Apache-2.0");
        assert!(ids.contains("MIT"));
        assert!(ids.contains("Apache-2.0"));

        // Not an SPDX expression: the text is kept verbatim.
        assert_eq!(
            parse_license_expression("Custom License"),
            set(&["Custom License"])
        );

        // LicenseRef parses but carries no standard id, so the text is kept.
        assert_eq!(
            parse_license_expression("LicenseRef-proprietary"),
            set(&["LicenseRef-proprietary"])
        );
    }

    #[test]
    fn test_license_set_equality() {
        // Insertion order must not matter for license comparison.
        let mut forward = component("test", None);
        forward.licenses.insert("MIT".into());
        forward.licenses.insert("Apache-2.0".into());

        let mut backward = component("test", None);
        backward.licenses.insert("Apache-2.0".into());
        backward.licenses.insert("MIT".into());

        assert_eq!(forward.licenses, backward.licenses);
    }

    #[test]
    fn test_query_api() {
        let mut sbom = Sbom::default();
        let a = add(&mut sbom, component("a", Some("1")));
        let b = add(&mut sbom, component("b", Some("1")));
        let c = add(&mut sbom, component("c", Some("1")));

        // Chain: a -> b -> c
        for (parent, child) in [(&a, &b), (&b, &c)] {
            sbom.dependencies
                .entry(parent.clone())
                .or_default()
                .insert(child.clone());
        }

        assert_eq!(sbom.roots(), vec![a.clone()]);
        assert_eq!(sbom.deps(&a), vec![b.clone()]);
        assert_eq!(sbom.rdeps(&b), vec![a.clone()]);

        let reachable = sbom.transitive_deps(&a);
        assert_eq!(reachable.len(), 2);
        assert!(reachable.contains(&b));
        assert!(reachable.contains(&c));

        assert_eq!(sbom.missing_hashes().len(), 3);
    }

    #[test]
    fn test_ecosystems_query() {
        let mut sbom = Sbom::default();

        let entries = [
            ("lodash", Some("npm")),
            ("serde", Some("cargo")),
            ("other-npm", Some("npm")),
            ("no-ecosystem", None),
        ];
        for (name, ecosystem) in entries {
            let mut entry = component(name, Some("1.0"));
            entry.ecosystem = ecosystem.map(str::to_string);
            add(&mut sbom, entry);
        }

        let ecosystems = sbom.ecosystems();
        assert_eq!(ecosystems.len(), 2);
        assert!(ecosystems.contains("npm"));
        assert!(ecosystems.contains("cargo"));
    }

    #[test]
    fn test_licenses_query() {
        let mut sbom = Sbom::default();

        let mut first = component("a", Some("1.0"));
        first.licenses = set(&["MIT", "Apache-2.0"]);
        let mut second = component("b", Some("1.0"));
        second.licenses = set(&["MIT", "GPL-3.0-only"]);
        let unlicensed = component("c", Some("1.0"));

        add(&mut sbom, first);
        add(&mut sbom, second);
        add(&mut sbom, unlicensed);

        let licenses = sbom.licenses();
        assert_eq!(licenses.len(), 3);
        for expected in ["MIT", "Apache-2.0", "GPL-3.0-only"] {
            assert!(licenses.contains(expected));
        }
    }

    #[test]
    fn test_by_purl() {
        let purl = "pkg:npm/lodash@4.17.21";
        let mut sbom = Sbom::default();

        let mut lodash = component("lodash", Some("4.17.21"));
        lodash.purl = Some(purl.to_string());
        lodash.id = ComponentId::new(lodash.purl.as_deref(), &[]);
        add(&mut sbom, lodash);
        add(&mut sbom, component("no-purl", Some("1.0")));

        let found = sbom.by_purl(purl);
        assert!(found.is_some());
        assert_eq!(found.unwrap().name, "lodash");

        assert!(sbom.by_purl("pkg:npm/nonexistent@1.0").is_none());
    }

    #[test]
    fn test_component_id_unparseable_purl() {
        // Input that is not a valid purl is still used verbatim as the id.
        let raw = "not-a-valid-purl-but-still-a-string";
        let id = ComponentId::new(Some(raw), &[]);
        assert_eq!(id.as_str(), raw);
    }

    #[test]
    fn test_component_id_display() {
        let id = ComponentId::new(Some("pkg:npm/foo@1.0"), &[]);
        assert_eq!(format!("{}", id), "pkg:npm/foo@1.0");
    }

    #[test]
    fn test_sbom_normalize_clears_metadata() {
        let mut sbom = Sbom::default();
        sbom.metadata = Metadata {
            timestamp: Some("2024-01-01T00:00:00Z".into()),
            tools: vec!["syft".into()],
            authors: vec!["alice".into()],
        };
        add(&mut sbom, component("a", Some("1")));

        sbom.normalize();

        assert!(sbom.metadata.timestamp.is_none());
        assert!(sbom.metadata.tools.is_empty());
        assert!(sbom.metadata.authors.is_empty());
    }

    #[test]
    fn test_missing_hashes_mixed() {
        let mut sbom = Sbom::default();

        let mut hashed = component("has-hash", Some("1.0"));
        hashed.hashes.insert("sha256".into(), "abc".into());

        add(&mut sbom, component("no-hash", Some("1.0")));
        add(&mut sbom, hashed);

        assert_eq!(sbom.missing_hashes().len(), 1);
    }

    #[test]
    fn test_ecosystem_from_purl() {
        let cases = [
            ("pkg:npm/lodash@4.17.21", Some("npm")),
            ("pkg:cargo/serde@1.0.0", Some("cargo")),
            ("pkg:pypi/requests@2.28.0", Some("pypi")),
            ("pkg:maven/org.apache/commons@1.0", Some("maven")),
            ("invalid-purl", None),
            ("", None),
        ];
        for (purl, expected) in cases {
            assert_eq!(ecosystem_from_purl(purl), expected.map(str::to_string));
        }
    }
}