Skip to main content

provenant/parsers/
about.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for AboutCode .ABOUT metadata files.
5//!
6//! Extracts package metadata from AboutCode .ABOUT YAML files which describe
7//! software components, licenses, and related information.
8//!
9//! # Supported Formats
10//! - .ABOUT (case-sensitive uppercase extension)
11//!
12//! # Key Features
13//! - YAML-based metadata parsing
14//! - Package URL (purl) parsing for type/namespace extraction
15//! - Owner party information
16//! - File reference tracking (about_resource field)
17//! - License expression extraction
18//! - Flexible field mapping (home_url/homepage_url)
19//!
20//! # Implementation Notes
21//! - Uses yaml_serde for YAML parsing
22//! - Uses packageurl crate for purl parsing
23//! - Extension is case-sensitive and must be uppercase (.ABOUT not .about)
24//! - Type can be overridden by 'type' field or extracted from 'purl' field
25//! - Graceful error handling: logs warnings and returns default on parse failure
26
27use crate::models::{DatasourceId, FileReference, PackageData, PackageType, Party};
28use crate::parser_warn as warn;
29use crate::parsers::utils::{read_file_to_string, truncate_field};
30use packageurl::PackageUrl;
31use std::path::Path;
32use std::str::FromStr;
33use url::Url;
34use yaml_serde::Value;
35
36use super::PackageParser;
37use super::license_normalization::{
38    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
39    normalize_spdx_expression,
40};
41
// Top-level keys looked up in the .ABOUT YAML mapping.
//
// `purl` / `package_url` and `home_url` / `homepage_url` are alternate
// spellings of the same information; the parser tries the first of each pair
// and falls back to the second.
const FIELD_TYPE: &str = "type";
const FIELD_PURL: &str = "purl";
const FIELD_PACKAGE_URL: &str = "package_url";
const FIELD_NAMESPACE: &str = "namespace";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
const FIELD_HOME_URL: &str = "home_url";
const FIELD_HOMEPAGE_URL: &str = "homepage_url";
const FIELD_DOWNLOAD_URL: &str = "download_url";
const FIELD_COPYRIGHT: &str = "copyright";
const FIELD_LICENSE_EXPRESSION: &str = "license_expression";
const FIELD_OWNER: &str = "owner";
const FIELD_ABOUT_RESOURCE: &str = "about_resource";
55
/// Parser for AboutCode `.ABOUT` metadata files.
///
/// This is a stateless marker type: all behaviour lives in its
/// `PackageParser` implementation, which reads the YAML document and maps its
/// package, licensing and file-reference fields onto a `PackageData` record.
pub struct AboutFileParser;
61
62#[derive(Clone)]
63struct InferredAboutIdentity {
64    package_type: PackageType,
65    namespace: Option<String>,
66    name: Option<String>,
67    version: Option<String>,
68}
69
70impl PackageParser for AboutFileParser {
71    const PACKAGE_TYPE: PackageType = PackageType::About;
72
73    fn extract_packages(path: &Path) -> Vec<PackageData> {
74        let yaml = match read_and_parse_yaml(path) {
75            Ok(yaml) => yaml,
76            Err(e) => {
77                warn!("Failed to read or parse .ABOUT file at {:?}: {}", path, e);
78                return vec![default_package_data()];
79            }
80        };
81
82        // Extract type and purl information
83        let about_type = yaml
84            .get(FIELD_TYPE)
85            .and_then(|v| v.as_str())
86            .map(String::from);
87
88        let about_namespace = yaml
89            .get(FIELD_NAMESPACE)
90            .and_then(|v| v.as_str())
91            .map(|v| truncate_field(v.to_string()));
92
93        let purl_string = yaml
94            .get(FIELD_PURL)
95            .and_then(|v| v.as_str())
96            .or_else(|| yaml.get(FIELD_PACKAGE_URL).and_then(|v| v.as_str()))
97            .map(|v| truncate_field(v.to_string()));
98
99        // Parse purl if present
100        let (purl_type, purl_namespace, purl_name, purl_version) =
101            if let Some(ref purl_str) = purl_string {
102                match PackageUrl::from_str(purl_str) {
103                    Ok(purl) => (
104                        Some(truncate_field(purl.ty().to_string())),
105                        purl.namespace().map(|v| truncate_field(v.to_string())),
106                        Some(truncate_field(purl.name().to_string())),
107                        purl.version().map(|v| truncate_field(v.to_string())),
108                    ),
109                    Err(e) => {
110                        warn!("Failed to parse purl '{}': {}", purl_str, e);
111                        (None, None, None, None)
112                    }
113                }
114            } else {
115                (None, None, None, None)
116            };
117
118        let inferred = infer_about_from_download_url(
119            yaml.get(FIELD_DOWNLOAD_URL).and_then(|v| v.as_str()),
120            yaml.get(FIELD_NAME)
121                .and_then(yaml_value_to_string)
122                .as_deref(),
123            yaml.get(FIELD_VERSION)
124                .and_then(yaml_value_to_string)
125                .as_deref(),
126        );
127
128        let package_type = about_type
129            .clone()
130            .or(purl_type)
131            .and_then(|s| s.parse::<crate::models::PackageType>().ok())
132            .or_else(|| inferred.as_ref().map(|identity| identity.package_type))
133            .unwrap_or(Self::PACKAGE_TYPE);
134
135        // Priority: about_namespace > purl_namespace
136        let namespace = about_namespace
137            .clone()
138            .or(purl_namespace.clone())
139            .or_else(|| {
140                inferred
141                    .as_ref()
142                    .and_then(|identity| identity.namespace.clone())
143            })
144            .map(truncate_field);
145
146        // Name and version from YAML or purl
147        let name = yaml
148            .get(FIELD_NAME)
149            .and_then(yaml_value_to_string)
150            .or(purl_name.clone())
151            .or_else(|| inferred.as_ref().and_then(|identity| identity.name.clone()))
152            .map(truncate_field);
153
154        let version = yaml
155            .get(FIELD_VERSION)
156            .and_then(yaml_value_to_string)
157            .or(purl_version.clone())
158            .or_else(|| {
159                inferred
160                    .as_ref()
161                    .and_then(|identity| identity.version.clone())
162            })
163            .map(truncate_field);
164
165        // Homepage URL (two possible field names)
166        let homepage_url = yaml
167            .get(FIELD_HOME_URL)
168            .and_then(|v| v.as_str())
169            .or_else(|| yaml.get(FIELD_HOMEPAGE_URL).and_then(|v| v.as_str()))
170            .map(|v| truncate_field(v.to_string()));
171
172        let download_url = yaml
173            .get(FIELD_DOWNLOAD_URL)
174            .and_then(|v| v.as_str())
175            .map(|v| truncate_field(v.to_string()));
176
177        let copyright = yaml
178            .get(FIELD_COPYRIGHT)
179            .and_then(|v| v.as_str())
180            .map(|v| truncate_field(v.to_string()));
181
182        let extracted_license_statement = yaml
183            .get(FIELD_LICENSE_EXPRESSION)
184            .and_then(|v| v.as_str())
185            .map(|v| truncate_field(v.to_string()));
186        let file_references = extract_file_references(&yaml);
187        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
188            extracted_license_statement
189                .as_deref()
190                .and_then(normalize_spdx_expression)
191                .map(|normalized| {
192                    build_declared_license_data(
193                        normalized,
194                        DeclaredLicenseMatchMetadata::single_line(
195                            extracted_license_statement.as_deref().unwrap_or_default(),
196                        ),
197                    )
198                })
199                .unwrap_or_else(|| {
200                    normalize_spdx_declared_license(extracted_license_statement.as_deref())
201                });
202
203        let vcs_url = yaml
204            .get(Value::String("vcs_url".to_string()))
205            .and_then(|v| v.as_str())
206            .map(|v| truncate_field(v.to_string()));
207
208        let extra_data = build_extra_data(&yaml);
209
210        let purl = purl_string
211            .or_else(|| {
212                let name = yaml
213                    .get(FIELD_NAME)
214                    .and_then(yaml_value_to_string)
215                    .or(purl_name.clone())
216                    .or_else(|| inferred.as_ref().and_then(|identity| identity.name.clone()));
217                let version = yaml
218                    .get(FIELD_VERSION)
219                    .and_then(yaml_value_to_string)
220                    .or(purl_version.clone())
221                    .or_else(|| {
222                        inferred
223                            .as_ref()
224                            .and_then(|identity| identity.version.clone())
225                    });
226                let namespace = about_namespace.clone().or_else(|| {
227                    inferred
228                        .as_ref()
229                        .and_then(|identity| identity.namespace.clone())
230                });
231                build_about_purl(
232                    package_type,
233                    namespace.as_deref(),
234                    name.as_deref(),
235                    version.as_deref(),
236                )
237            })
238            .map(truncate_field);
239
240        // Owner party
241        let parties = extract_owner_party(&yaml);
242
243        // File references
244        vec![PackageData {
245            package_type: Some(package_type),
246            namespace,
247            name,
248            version,
249            qualifiers: None,
250            subpath: None,
251            primary_language: None,
252            description: None,
253            release_date: None,
254            parties,
255            keywords: Vec::new(),
256            homepage_url,
257            download_url,
258            size: None,
259            sha1: None,
260            md5: None,
261            sha256: None,
262            sha512: None,
263            bug_tracking_url: None,
264            code_view_url: None,
265            vcs_url,
266            copyright,
267            holder: None,
268            declared_license_expression,
269            declared_license_expression_spdx,
270            license_detections,
271            other_license_expression: None,
272            other_license_expression_spdx: None,
273            other_license_detections: Vec::new(),
274            extracted_license_statement,
275            notice_text: None,
276            source_packages: Vec::new(),
277            file_references,
278            is_private: false,
279            is_virtual: false,
280            extra_data,
281            dependencies: Vec::new(),
282            repository_homepage_url: None,
283            repository_download_url: None,
284            api_data_url: None,
285            datasource_id: Some(DatasourceId::AboutFile),
286            purl,
287        }]
288    }
289
290    fn is_match(path: &Path) -> bool {
291        path.extension()
292            .and_then(|ext| ext.to_str())
293            .is_some_and(|ext| ext == "ABOUT")
294    }
295}
296
297/// Reads and parses a YAML file.
298fn read_and_parse_yaml(path: &Path) -> Result<yaml_serde::Mapping, String> {
299    let content =
300        read_file_to_string(path, None).map_err(|e| format!("Failed to read file: {}", e))?;
301
302    let value: Value =
303        yaml_serde::from_str(&content).map_err(|e| format!("Failed to parse YAML: {}", e))?;
304
305    match value {
306        Value::Mapping(map) => Ok(map),
307        _ => Err("Expected YAML mapping at root".to_string()),
308    }
309}
310
311/// Converts a YAML value to a string, handling strings, numbers, and booleans.
312fn yaml_value_to_string(value: &Value) -> Option<String> {
313    match value {
314        Value::String(s) => Some(s.clone()),
315        Value::Number(n) => Some(n.to_string()),
316        Value::Bool(b) => Some(b.to_string()),
317        _ => None,
318    }
319}
320
321/// Extracts owner party information from YAML.
322fn extract_owner_party(yaml: &yaml_serde::Mapping) -> Vec<Party> {
323    let owner = yaml
324        .get(Value::String(FIELD_OWNER.to_string()))
325        .map(|v| match v {
326            Value::String(s) => truncate_field(s.clone()),
327            _ => truncate_field(format!("{:?}", v)),
328        });
329
330    if let Some(owner_name) = owner {
331        if !owner_name.is_empty() {
332            vec![Party {
333                r#type: Some("person".to_string()),
334                role: Some("owner".to_string()),
335                name: Some(owner_name),
336                email: None,
337                url: None,
338                organization: None,
339                organization_url: None,
340                timezone: None,
341            }]
342        } else {
343            Vec::new()
344        }
345    } else {
346        Vec::new()
347    }
348}
349
350/// Extracts file references from YAML.
351fn extract_file_references(yaml: &yaml_serde::Mapping) -> Vec<FileReference> {
352    let about_resource = yaml
353        .get(Value::String(FIELD_ABOUT_RESOURCE.to_string()))
354        .and_then(|v| v.as_str());
355    let license_file = yaml
356        .get(Value::String("license_file".to_string()))
357        .and_then(|v| v.as_str());
358    let notice_file = yaml
359        .get(Value::String("notice_file".to_string()))
360        .and_then(|v| v.as_str());
361
362    let mut refs = Vec::new();
363
364    if let Some(path) = about_resource {
365        refs.push(FileReference {
366            path: truncate_field(path.to_string()),
367            size: None,
368            sha1: None,
369            md5: None,
370            sha256: None,
371            sha512: None,
372            extra_data: None,
373        });
374    }
375
376    for path in [license_file, notice_file].into_iter().flatten() {
377        refs.push(FileReference {
378            path: truncate_field(path.to_string()),
379            size: None,
380            sha1: None,
381            md5: None,
382            sha256: None,
383            sha512: None,
384            extra_data: None,
385        });
386    }
387
388    refs
389}
390
391/// Returns a default (empty) PackageData structure.
392fn default_package_data() -> PackageData {
393    PackageData {
394        package_type: Some(PackageType::About),
395        datasource_id: Some(DatasourceId::AboutFile),
396        ..Default::default()
397    }
398}
399
400fn infer_about_from_download_url(
401    download_url: Option<&str>,
402    about_name: Option<&str>,
403    about_version: Option<&str>,
404) -> Option<InferredAboutIdentity> {
405    let url = Url::parse(download_url?).ok()?;
406    let host = url.host_str()?;
407
408    if matches!(host, "pypi.python.org" | "files.pythonhosted.org") {
409        let name = about_name.map(str::to_string)?;
410        let version = about_version.map(str::to_string);
411        return Some(InferredAboutIdentity {
412            package_type: PackageType::Pypi,
413            namespace: None,
414            name: Some(name),
415            version,
416        });
417    }
418
419    if matches!(host, "raw.githubusercontent.com" | "github.com") {
420        let mut segments = url.path_segments()?;
421        let owner = segments.next()?.to_string();
422        let repo = segments.next()?.to_string();
423        return Some(InferredAboutIdentity {
424            package_type: PackageType::Github,
425            namespace: Some(owner),
426            name: Some(repo),
427            version: None,
428        });
429    }
430
431    None
432}
433
434fn build_about_purl(
435    package_type: PackageType,
436    namespace: Option<&str>,
437    name: Option<&str>,
438    version: Option<&str>,
439) -> Option<String> {
440    if package_type == PackageType::About {
441        return None;
442    }
443
444    let name = name?;
445    let mut purl = PackageUrl::new(package_type.as_str(), name).ok()?;
446    if let Some(namespace) = namespace {
447        purl.with_namespace(namespace).ok()?;
448    }
449    if let Some(version) = version {
450        purl.with_version(version).ok()?;
451    }
452    Some(purl.to_string())
453}
454
455fn build_extra_data(
456    yaml: &yaml_serde::Mapping,
457) -> Option<std::collections::HashMap<String, serde_json::Value>> {
458    let mut extra_data = std::collections::HashMap::new();
459    for key in ["license_file", "notice_file", "notes"] {
460        if let Some(value) = yaml.get(Value::String(key.to_string()))
461            && let Some(value) = yaml_value_to_string(value)
462        {
463            extra_data.insert(
464                key.to_string(),
465                serde_json::Value::String(truncate_field(value)),
466            );
467        }
468    }
469    (!extra_data.is_empty()).then_some(extra_data)
470}
471
// Registers this parser's metadata with the crate-wide parser registry.
// NOTE(review): the macro definition is not visible here; the arguments look
// like description, path globs, package type, primary language (empty) and
// documentation URL — confirm against `register_parser!`.
crate::register_parser!(
    "AboutCode .ABOUT metadata file",
    &["**/*.ABOUT"],
    "about",
    "",
    Some("https://aboutcode-toolkit.readthedocs.io/en/latest/specification.html"),
);