Skip to main content

provenant/parsers/
about.rs

1//! Parser for AboutCode .ABOUT metadata files.
2//!
3//! Extracts package metadata from AboutCode .ABOUT YAML files which describe
4//! software components, licenses, and related information.
5//!
6//! # Supported Formats
7//! - .ABOUT (case-sensitive uppercase extension)
8//!
9//! # Key Features
10//! - YAML-based metadata parsing
11//! - Package URL (purl) parsing for type/namespace extraction
12//! - Owner party information
13//! - File reference tracking (about_resource field)
14//! - License expression extraction
15//! - Flexible field mapping (home_url/homepage_url)
16//!
17//! # Implementation Notes
18//! - Uses serde_yaml for YAML parsing
19//! - Uses packageurl crate for purl parsing
20//! - Extension is case-sensitive and must be uppercase (.ABOUT not .about)
21//! - Type can be overridden by 'type' field or extracted from 'purl' field
22//! - Graceful error handling: logs warnings and returns default on parse failure
23
24use crate::models::{DatasourceId, FileReference, PackageData, PackageType, Party};
25use log::warn;
26use packageurl::PackageUrl;
27use serde_yaml::Value;
28use std::fs;
29use std::path::Path;
30use std::str::FromStr;
31use url::Url;
32
33use super::PackageParser;
34
// Top-level keys looked up in the parsed .ABOUT YAML mapping.
//
// A few additional keys ("vcs_url", "license_file", "notice_file", "notes")
// are spelled inline at their use sites rather than declared here.

// Package identity.
const FIELD_TYPE: &str = "type";
// Two alternative keys for the package URL; "purl" is consulted first.
const FIELD_PURL: &str = "purl";
const FIELD_PACKAGE_URL: &str = "package_url";
const FIELD_NAMESPACE: &str = "namespace";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
// Two alternative keys for the homepage; "home_url" is consulted first.
const FIELD_HOME_URL: &str = "home_url";
const FIELD_HOMEPAGE_URL: &str = "homepage_url";
const FIELD_DOWNLOAD_URL: &str = "download_url";
// Legal metadata.
const FIELD_COPYRIGHT: &str = "copyright";
const FIELD_LICENSE_EXPRESSION: &str = "license_expression";
// Reported as a single party with the "owner" role.
const FIELD_OWNER: &str = "owner";
// Path of the resource the .ABOUT file describes; reported as a file reference.
const FIELD_ABOUT_RESOURCE: &str = "about_resource";
48
/// AboutCode .ABOUT file parser.
///
/// Parses AboutCode metadata files that contain package information,
/// licensing, and file references in YAML format.
///
/// This is a stateless marker type: all behaviour lives in its
/// `PackageParser` implementation, whose functions take the file path
/// directly and never need an instance.
pub struct AboutFileParser;
54
/// Package identity guessed from a `download_url` when the .ABOUT file does
/// not state it explicitly.
///
/// Produced by `infer_about_from_download_url` and used only as the
/// lowest-priority fallback when resolving type, namespace, name and version.
#[derive(Clone)]
struct InferredAboutIdentity {
    /// Ecosystem implied by the download host.
    package_type: PackageType,
    /// Namespace implied by the URL, if any.
    namespace: Option<String>,
    /// Package name implied by the URL or carried over from the .ABOUT file.
    name: Option<String>,
    /// Package version, when one is known.
    version: Option<String>,
}
62
63impl PackageParser for AboutFileParser {
64    const PACKAGE_TYPE: PackageType = PackageType::About;
65
66    fn extract_packages(path: &Path) -> Vec<PackageData> {
67        let yaml = match read_and_parse_yaml(path) {
68            Ok(yaml) => yaml,
69            Err(e) => {
70                warn!("Failed to read or parse .ABOUT file at {:?}: {}", path, e);
71                return vec![default_package_data()];
72            }
73        };
74
75        // Extract type and purl information
76        let about_type = yaml
77            .get(FIELD_TYPE)
78            .and_then(|v| v.as_str())
79            .map(String::from);
80
81        let about_namespace = yaml
82            .get(FIELD_NAMESPACE)
83            .and_then(|v| v.as_str())
84            .map(String::from);
85
86        let purl_string = yaml
87            .get(FIELD_PURL)
88            .and_then(|v| v.as_str())
89            .or_else(|| yaml.get(FIELD_PACKAGE_URL).and_then(|v| v.as_str()))
90            .map(String::from);
91
92        // Parse purl if present
93        let (purl_type, purl_namespace, purl_name, purl_version) =
94            if let Some(ref purl_str) = purl_string {
95                match PackageUrl::from_str(purl_str) {
96                    Ok(purl) => (
97                        Some(purl.ty().to_string()),
98                        purl.namespace().map(String::from),
99                        Some(purl.name().to_string()),
100                        purl.version().map(String::from),
101                    ),
102                    Err(e) => {
103                        warn!("Failed to parse purl '{}': {}", purl_str, e);
104                        (None, None, None, None)
105                    }
106                }
107            } else {
108                (None, None, None, None)
109            };
110
111        let inferred = infer_about_from_download_url(
112            yaml.get(FIELD_DOWNLOAD_URL).and_then(|v| v.as_str()),
113            yaml.get(FIELD_NAME)
114                .and_then(yaml_value_to_string)
115                .as_deref(),
116            yaml.get(FIELD_VERSION)
117                .and_then(yaml_value_to_string)
118                .as_deref(),
119        );
120
121        let package_type = about_type
122            .clone()
123            .or(purl_type)
124            .and_then(|s| s.parse::<crate::models::PackageType>().ok())
125            .or_else(|| inferred.as_ref().map(|identity| identity.package_type))
126            .unwrap_or(Self::PACKAGE_TYPE);
127
128        // Priority: about_namespace > purl_namespace
129        let namespace = about_namespace
130            .clone()
131            .or(purl_namespace.clone())
132            .or_else(|| {
133                inferred
134                    .as_ref()
135                    .and_then(|identity| identity.namespace.clone())
136            });
137
138        // Name and version from YAML or purl
139        let name = yaml
140            .get(FIELD_NAME)
141            .and_then(yaml_value_to_string)
142            .or(purl_name.clone())
143            .or_else(|| inferred.as_ref().and_then(|identity| identity.name.clone()));
144
145        let version = yaml
146            .get(FIELD_VERSION)
147            .and_then(yaml_value_to_string)
148            .or(purl_version.clone())
149            .or_else(|| {
150                inferred
151                    .as_ref()
152                    .and_then(|identity| identity.version.clone())
153            });
154
155        // Homepage URL (two possible field names)
156        let homepage_url = yaml
157            .get(FIELD_HOME_URL)
158            .and_then(|v| v.as_str())
159            .or_else(|| yaml.get(FIELD_HOMEPAGE_URL).and_then(|v| v.as_str()))
160            .map(String::from);
161
162        let download_url = yaml
163            .get(FIELD_DOWNLOAD_URL)
164            .and_then(|v| v.as_str())
165            .map(String::from);
166
167        let copyright = yaml
168            .get(FIELD_COPYRIGHT)
169            .and_then(|v| v.as_str())
170            .map(String::from);
171
172        let extracted_license_statement = yaml
173            .get(FIELD_LICENSE_EXPRESSION)
174            .and_then(|v| v.as_str())
175            .map(String::from);
176
177        let vcs_url = yaml
178            .get(Value::String("vcs_url".to_string()))
179            .and_then(|v| v.as_str())
180            .map(String::from);
181
182        let extra_data = build_extra_data(&yaml);
183
184        let purl = purl_string.or_else(|| {
185            let name = yaml
186                .get(FIELD_NAME)
187                .and_then(yaml_value_to_string)
188                .or(purl_name.clone())
189                .or_else(|| inferred.as_ref().and_then(|identity| identity.name.clone()));
190            let version = yaml
191                .get(FIELD_VERSION)
192                .and_then(yaml_value_to_string)
193                .or(purl_version.clone())
194                .or_else(|| {
195                    inferred
196                        .as_ref()
197                        .and_then(|identity| identity.version.clone())
198                });
199            let namespace = about_namespace.clone().or_else(|| {
200                inferred
201                    .as_ref()
202                    .and_then(|identity| identity.namespace.clone())
203            });
204            build_about_purl(
205                package_type,
206                namespace.as_deref(),
207                name.as_deref(),
208                version.as_deref(),
209            )
210        });
211
212        // Owner party
213        let parties = extract_owner_party(&yaml);
214
215        // File references
216        let file_references = extract_file_references(&yaml);
217
218        vec![PackageData {
219            package_type: Some(package_type),
220            namespace,
221            name,
222            version,
223            qualifiers: None,
224            subpath: None,
225            primary_language: None,
226            description: None,
227            release_date: None,
228            parties,
229            keywords: Vec::new(),
230            homepage_url,
231            download_url,
232            size: None,
233            sha1: None,
234            md5: None,
235            sha256: None,
236            sha512: None,
237            bug_tracking_url: None,
238            code_view_url: None,
239            vcs_url,
240            copyright,
241            holder: None,
242            declared_license_expression: None,
243            declared_license_expression_spdx: None,
244            license_detections: Vec::new(),
245            other_license_expression: None,
246            other_license_expression_spdx: None,
247            other_license_detections: Vec::new(),
248            extracted_license_statement,
249            notice_text: None,
250            source_packages: Vec::new(),
251            file_references,
252            is_private: false,
253            is_virtual: false,
254            extra_data,
255            dependencies: Vec::new(),
256            repository_homepage_url: None,
257            repository_download_url: None,
258            api_data_url: None,
259            datasource_id: Some(DatasourceId::AboutFile),
260            purl,
261        }]
262    }
263
264    fn is_match(path: &Path) -> bool {
265        path.extension()
266            .and_then(|ext| ext.to_str())
267            .is_some_and(|ext| ext == "ABOUT")
268    }
269}
270
271/// Reads and parses a YAML file.
272fn read_and_parse_yaml(path: &Path) -> Result<serde_yaml::Mapping, String> {
273    let content = fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
274
275    let value: Value =
276        serde_yaml::from_str(&content).map_err(|e| format!("Failed to parse YAML: {}", e))?;
277
278    match value {
279        Value::Mapping(map) => Ok(map),
280        _ => Err("Expected YAML mapping at root".to_string()),
281    }
282}
283
284/// Converts a YAML value to a string, handling strings, numbers, and booleans.
285fn yaml_value_to_string(value: &Value) -> Option<String> {
286    match value {
287        Value::String(s) => Some(s.clone()),
288        Value::Number(n) => Some(n.to_string()),
289        Value::Bool(b) => Some(b.to_string()),
290        _ => None,
291    }
292}
293
294/// Extracts owner party information from YAML.
295fn extract_owner_party(yaml: &serde_yaml::Mapping) -> Vec<Party> {
296    let owner = yaml
297        .get(Value::String(FIELD_OWNER.to_string()))
298        .map(|v| match v {
299            Value::String(s) => s.clone(),
300            _ => {
301                // Convert non-string values to their debug representation
302                format!("{:?}", v)
303            }
304        });
305
306    if let Some(owner_name) = owner {
307        if !owner_name.is_empty() {
308            vec![Party {
309                r#type: Some("person".to_string()),
310                role: Some("owner".to_string()),
311                name: Some(owner_name),
312                email: None,
313                url: None,
314                organization: None,
315                organization_url: None,
316                timezone: None,
317            }]
318        } else {
319            Vec::new()
320        }
321    } else {
322        Vec::new()
323    }
324}
325
326/// Extracts file references from YAML.
327fn extract_file_references(yaml: &serde_yaml::Mapping) -> Vec<FileReference> {
328    let about_resource = yaml
329        .get(Value::String(FIELD_ABOUT_RESOURCE.to_string()))
330        .and_then(|v| v.as_str());
331    let license_file = yaml
332        .get(Value::String("license_file".to_string()))
333        .and_then(|v| v.as_str());
334    let notice_file = yaml
335        .get(Value::String("notice_file".to_string()))
336        .and_then(|v| v.as_str());
337
338    let mut refs = Vec::new();
339
340    if let Some(path) = about_resource {
341        refs.push(FileReference {
342            path: path.to_string(),
343            size: None,
344            sha1: None,
345            md5: None,
346            sha256: None,
347            sha512: None,
348            extra_data: None,
349        });
350    }
351
352    for path in [license_file, notice_file].into_iter().flatten() {
353        refs.push(FileReference {
354            path: path.to_string(),
355            size: None,
356            sha1: None,
357            md5: None,
358            sha256: None,
359            sha512: None,
360            extra_data: None,
361        });
362    }
363
364    refs
365}
366
367/// Returns a default (empty) PackageData structure.
368fn default_package_data() -> PackageData {
369    PackageData {
370        package_type: Some(PackageType::About),
371        datasource_id: Some(DatasourceId::AboutFile),
372        ..Default::default()
373    }
374}
375
376fn infer_about_from_download_url(
377    download_url: Option<&str>,
378    about_name: Option<&str>,
379    about_version: Option<&str>,
380) -> Option<InferredAboutIdentity> {
381    let url = Url::parse(download_url?).ok()?;
382    let host = url.host_str()?;
383
384    if matches!(host, "pypi.python.org" | "files.pythonhosted.org") {
385        let name = about_name.map(str::to_string)?;
386        let version = about_version.map(str::to_string);
387        return Some(InferredAboutIdentity {
388            package_type: PackageType::Pypi,
389            namespace: None,
390            name: Some(name),
391            version,
392        });
393    }
394
395    if matches!(host, "raw.githubusercontent.com" | "github.com") {
396        let mut segments = url.path_segments()?;
397        let owner = segments.next()?.to_string();
398        let repo = segments.next()?.to_string();
399        return Some(InferredAboutIdentity {
400            package_type: PackageType::Github,
401            namespace: Some(owner),
402            name: Some(repo),
403            version: None,
404        });
405    }
406
407    None
408}
409
410fn build_about_purl(
411    package_type: PackageType,
412    namespace: Option<&str>,
413    name: Option<&str>,
414    version: Option<&str>,
415) -> Option<String> {
416    if package_type == PackageType::About {
417        return None;
418    }
419
420    let name = name?;
421    let mut purl = PackageUrl::new(package_type.as_str(), name).ok()?;
422    if let Some(namespace) = namespace {
423        purl.with_namespace(namespace).ok()?;
424    }
425    if let Some(version) = version {
426        purl.with_version(version).ok()?;
427    }
428    Some(purl.to_string())
429}
430
431fn build_extra_data(
432    yaml: &serde_yaml::Mapping,
433) -> Option<std::collections::HashMap<String, serde_json::Value>> {
434    let mut extra_data = std::collections::HashMap::new();
435    for key in ["license_file", "notice_file", "notes"] {
436        if let Some(value) = yaml.get(Value::String(key.to_string()))
437            && let Some(value) = yaml_value_to_string(value)
438        {
439            extra_data.insert(key.to_string(), serde_json::Value::String(value));
440        }
441    }
442    (!extra_data.is_empty()).then_some(extra_data)
443}
444
// Registers this parser's metadata with the crate via `register_parser!`.
// NOTE(review): the macro definition is not visible from this file; the
// arguments appear to be, in order: a human-readable description, the path
// glob patterns, the package type name, a (here empty) primary language, and
// a documentation URL — confirm against the macro definition.
crate::register_parser!(
    "AboutCode .ABOUT metadata file",
    &["**/*.ABOUT"],
    "about",
    "",
    Some("https://aboutcode-toolkit.readthedocs.io/en/latest/specification.html"),
);