// provenant/parsers/about.rs

//! Parser for AboutCode .ABOUT metadata files.
//!
//! Extracts package metadata from AboutCode .ABOUT YAML files which describe
//! software components, licenses, and related information.
//!
//! # Supported Formats
//! - .ABOUT (case-sensitive uppercase extension)
//!
//! # Key Features
//! - YAML-based metadata parsing
//! - Package URL (purl) parsing for type/namespace extraction
//! - Owner party information
//! - File reference tracking (about_resource field)
//! - License expression extraction
//! - Flexible field mapping (home_url/homepage_url)
//!
//! # Implementation Notes
//! - Uses yaml_serde for YAML parsing
//! - Uses packageurl crate for purl parsing
//! - Extension is case-sensitive and must be uppercase (.ABOUT not .about)
//! - Type can be overridden by 'type' field or extracted from 'purl' field
//! - Graceful error handling: logs warnings and returns default on parse failure

use crate::models::{DatasourceId, FileReference, PackageData, PackageType, Party};
use crate::parser_warn as warn;
use packageurl::PackageUrl;
use std::fs;
use std::path::Path;
use std::str::FromStr;
use url::Url;
use yaml_serde::Value;

use super::PackageParser;
use super::license_normalization::{
    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
    normalize_spdx_expression,
};

// Keys looked up in the root mapping of an .ABOUT file.

/// Key holding an explicit package type.
const FIELD_TYPE: &str = "type";
/// Key holding a Package URL string.
const FIELD_PURL: &str = "purl";
/// Alternative key for the Package URL, consulted when `purl` is absent.
const FIELD_PACKAGE_URL: &str = "package_url";
/// Key holding an explicit package namespace.
const FIELD_NAMESPACE: &str = "namespace";
/// Key holding the package name.
const FIELD_NAME: &str = "name";
/// Key holding the package version.
const FIELD_VERSION: &str = "version";
/// Key holding the homepage URL (first choice).
const FIELD_HOME_URL: &str = "home_url";
/// Key holding the homepage URL (used when `home_url` is absent).
const FIELD_HOMEPAGE_URL: &str = "homepage_url";
/// Key holding the download URL.
const FIELD_DOWNLOAD_URL: &str = "download_url";
/// Key holding the copyright statement.
const FIELD_COPYRIGHT: &str = "copyright";
/// Key holding the license expression.
const FIELD_LICENSE_EXPRESSION: &str = "license_expression";
/// Key holding the owner of the described component.
const FIELD_OWNER: &str = "owner";
/// Key holding the path of the resource the .ABOUT file describes.
const FIELD_ABOUT_RESOURCE: &str = "about_resource";

/// AboutCode .ABOUT file parser.
///
/// Stateless marker type: all parsing behavior is provided through its
/// `PackageParser` implementation, which reads AboutCode metadata files
/// (package information, licensing, and file references) in YAML format.
pub struct AboutFileParser;

59#[derive(Clone)]
60struct InferredAboutIdentity {
61    package_type: PackageType,
62    namespace: Option<String>,
63    name: Option<String>,
64    version: Option<String>,
65}
66
67impl PackageParser for AboutFileParser {
68    const PACKAGE_TYPE: PackageType = PackageType::About;
69
70    fn extract_packages(path: &Path) -> Vec<PackageData> {
71        let yaml = match read_and_parse_yaml(path) {
72            Ok(yaml) => yaml,
73            Err(e) => {
74                warn!("Failed to read or parse .ABOUT file at {:?}: {}", path, e);
75                return vec![default_package_data()];
76            }
77        };
78
79        // Extract type and purl information
80        let about_type = yaml
81            .get(FIELD_TYPE)
82            .and_then(|v| v.as_str())
83            .map(String::from);
84
85        let about_namespace = yaml
86            .get(FIELD_NAMESPACE)
87            .and_then(|v| v.as_str())
88            .map(String::from);
89
90        let purl_string = yaml
91            .get(FIELD_PURL)
92            .and_then(|v| v.as_str())
93            .or_else(|| yaml.get(FIELD_PACKAGE_URL).and_then(|v| v.as_str()))
94            .map(String::from);
95
96        // Parse purl if present
97        let (purl_type, purl_namespace, purl_name, purl_version) =
98            if let Some(ref purl_str) = purl_string {
99                match PackageUrl::from_str(purl_str) {
100                    Ok(purl) => (
101                        Some(purl.ty().to_string()),
102                        purl.namespace().map(String::from),
103                        Some(purl.name().to_string()),
104                        purl.version().map(String::from),
105                    ),
106                    Err(e) => {
107                        warn!("Failed to parse purl '{}': {}", purl_str, e);
108                        (None, None, None, None)
109                    }
110                }
111            } else {
112                (None, None, None, None)
113            };
114
115        let inferred = infer_about_from_download_url(
116            yaml.get(FIELD_DOWNLOAD_URL).and_then(|v| v.as_str()),
117            yaml.get(FIELD_NAME)
118                .and_then(yaml_value_to_string)
119                .as_deref(),
120            yaml.get(FIELD_VERSION)
121                .and_then(yaml_value_to_string)
122                .as_deref(),
123        );
124
125        let package_type = about_type
126            .clone()
127            .or(purl_type)
128            .and_then(|s| s.parse::<crate::models::PackageType>().ok())
129            .or_else(|| inferred.as_ref().map(|identity| identity.package_type))
130            .unwrap_or(Self::PACKAGE_TYPE);
131
132        // Priority: about_namespace > purl_namespace
133        let namespace = about_namespace
134            .clone()
135            .or(purl_namespace.clone())
136            .or_else(|| {
137                inferred
138                    .as_ref()
139                    .and_then(|identity| identity.namespace.clone())
140            });
141
142        // Name and version from YAML or purl
143        let name = yaml
144            .get(FIELD_NAME)
145            .and_then(yaml_value_to_string)
146            .or(purl_name.clone())
147            .or_else(|| inferred.as_ref().and_then(|identity| identity.name.clone()));
148
149        let version = yaml
150            .get(FIELD_VERSION)
151            .and_then(yaml_value_to_string)
152            .or(purl_version.clone())
153            .or_else(|| {
154                inferred
155                    .as_ref()
156                    .and_then(|identity| identity.version.clone())
157            });
158
159        // Homepage URL (two possible field names)
160        let homepage_url = yaml
161            .get(FIELD_HOME_URL)
162            .and_then(|v| v.as_str())
163            .or_else(|| yaml.get(FIELD_HOMEPAGE_URL).and_then(|v| v.as_str()))
164            .map(String::from);
165
166        let download_url = yaml
167            .get(FIELD_DOWNLOAD_URL)
168            .and_then(|v| v.as_str())
169            .map(String::from);
170
171        let copyright = yaml
172            .get(FIELD_COPYRIGHT)
173            .and_then(|v| v.as_str())
174            .map(String::from);
175
176        let extracted_license_statement = yaml
177            .get(FIELD_LICENSE_EXPRESSION)
178            .and_then(|v| v.as_str())
179            .map(String::from);
180        let file_references = extract_file_references(&yaml);
181        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
182            extracted_license_statement
183                .as_deref()
184                .and_then(normalize_spdx_expression)
185                .map(|normalized| {
186                    build_declared_license_data(
187                        normalized,
188                        DeclaredLicenseMatchMetadata::single_line(
189                            extracted_license_statement.as_deref().unwrap_or_default(),
190                        ),
191                    )
192                })
193                .unwrap_or_else(|| {
194                    normalize_spdx_declared_license(extracted_license_statement.as_deref())
195                });
196
197        let vcs_url = yaml
198            .get(Value::String("vcs_url".to_string()))
199            .and_then(|v| v.as_str())
200            .map(String::from);
201
202        let extra_data = build_extra_data(&yaml);
203
204        let purl = purl_string.or_else(|| {
205            let name = yaml
206                .get(FIELD_NAME)
207                .and_then(yaml_value_to_string)
208                .or(purl_name.clone())
209                .or_else(|| inferred.as_ref().and_then(|identity| identity.name.clone()));
210            let version = yaml
211                .get(FIELD_VERSION)
212                .and_then(yaml_value_to_string)
213                .or(purl_version.clone())
214                .or_else(|| {
215                    inferred
216                        .as_ref()
217                        .and_then(|identity| identity.version.clone())
218                });
219            let namespace = about_namespace.clone().or_else(|| {
220                inferred
221                    .as_ref()
222                    .and_then(|identity| identity.namespace.clone())
223            });
224            build_about_purl(
225                package_type,
226                namespace.as_deref(),
227                name.as_deref(),
228                version.as_deref(),
229            )
230        });
231
232        // Owner party
233        let parties = extract_owner_party(&yaml);
234
235        // File references
236        vec![PackageData {
237            package_type: Some(package_type),
238            namespace,
239            name,
240            version,
241            qualifiers: None,
242            subpath: None,
243            primary_language: None,
244            description: None,
245            release_date: None,
246            parties,
247            keywords: Vec::new(),
248            homepage_url,
249            download_url,
250            size: None,
251            sha1: None,
252            md5: None,
253            sha256: None,
254            sha512: None,
255            bug_tracking_url: None,
256            code_view_url: None,
257            vcs_url,
258            copyright,
259            holder: None,
260            declared_license_expression,
261            declared_license_expression_spdx,
262            license_detections,
263            other_license_expression: None,
264            other_license_expression_spdx: None,
265            other_license_detections: Vec::new(),
266            extracted_license_statement,
267            notice_text: None,
268            source_packages: Vec::new(),
269            file_references,
270            is_private: false,
271            is_virtual: false,
272            extra_data,
273            dependencies: Vec::new(),
274            repository_homepage_url: None,
275            repository_download_url: None,
276            api_data_url: None,
277            datasource_id: Some(DatasourceId::AboutFile),
278            purl,
279        }]
280    }
281
282    fn is_match(path: &Path) -> bool {
283        path.extension()
284            .and_then(|ext| ext.to_str())
285            .is_some_and(|ext| ext == "ABOUT")
286    }
287}
288
289/// Reads and parses a YAML file.
290fn read_and_parse_yaml(path: &Path) -> Result<yaml_serde::Mapping, String> {
291    let content = fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
292
293    let value: Value =
294        yaml_serde::from_str(&content).map_err(|e| format!("Failed to parse YAML: {}", e))?;
295
296    match value {
297        Value::Mapping(map) => Ok(map),
298        _ => Err("Expected YAML mapping at root".to_string()),
299    }
300}
301
302/// Converts a YAML value to a string, handling strings, numbers, and booleans.
303fn yaml_value_to_string(value: &Value) -> Option<String> {
304    match value {
305        Value::String(s) => Some(s.clone()),
306        Value::Number(n) => Some(n.to_string()),
307        Value::Bool(b) => Some(b.to_string()),
308        _ => None,
309    }
310}
311
312/// Extracts owner party information from YAML.
313fn extract_owner_party(yaml: &yaml_serde::Mapping) -> Vec<Party> {
314    let owner = yaml
315        .get(Value::String(FIELD_OWNER.to_string()))
316        .map(|v| match v {
317            Value::String(s) => s.clone(),
318            _ => {
319                // Convert non-string values to their debug representation
320                format!("{:?}", v)
321            }
322        });
323
324    if let Some(owner_name) = owner {
325        if !owner_name.is_empty() {
326            vec![Party {
327                r#type: Some("person".to_string()),
328                role: Some("owner".to_string()),
329                name: Some(owner_name),
330                email: None,
331                url: None,
332                organization: None,
333                organization_url: None,
334                timezone: None,
335            }]
336        } else {
337            Vec::new()
338        }
339    } else {
340        Vec::new()
341    }
342}
343
344/// Extracts file references from YAML.
345fn extract_file_references(yaml: &yaml_serde::Mapping) -> Vec<FileReference> {
346    let about_resource = yaml
347        .get(Value::String(FIELD_ABOUT_RESOURCE.to_string()))
348        .and_then(|v| v.as_str());
349    let license_file = yaml
350        .get(Value::String("license_file".to_string()))
351        .and_then(|v| v.as_str());
352    let notice_file = yaml
353        .get(Value::String("notice_file".to_string()))
354        .and_then(|v| v.as_str());
355
356    let mut refs = Vec::new();
357
358    if let Some(path) = about_resource {
359        refs.push(FileReference {
360            path: path.to_string(),
361            size: None,
362            sha1: None,
363            md5: None,
364            sha256: None,
365            sha512: None,
366            extra_data: None,
367        });
368    }
369
370    for path in [license_file, notice_file].into_iter().flatten() {
371        refs.push(FileReference {
372            path: path.to_string(),
373            size: None,
374            sha1: None,
375            md5: None,
376            sha256: None,
377            sha512: None,
378            extra_data: None,
379        });
380    }
381
382    refs
383}
384
385/// Returns a default (empty) PackageData structure.
386fn default_package_data() -> PackageData {
387    PackageData {
388        package_type: Some(PackageType::About),
389        datasource_id: Some(DatasourceId::AboutFile),
390        ..Default::default()
391    }
392}
393
394fn infer_about_from_download_url(
395    download_url: Option<&str>,
396    about_name: Option<&str>,
397    about_version: Option<&str>,
398) -> Option<InferredAboutIdentity> {
399    let url = Url::parse(download_url?).ok()?;
400    let host = url.host_str()?;
401
402    if matches!(host, "pypi.python.org" | "files.pythonhosted.org") {
403        let name = about_name.map(str::to_string)?;
404        let version = about_version.map(str::to_string);
405        return Some(InferredAboutIdentity {
406            package_type: PackageType::Pypi,
407            namespace: None,
408            name: Some(name),
409            version,
410        });
411    }
412
413    if matches!(host, "raw.githubusercontent.com" | "github.com") {
414        let mut segments = url.path_segments()?;
415        let owner = segments.next()?.to_string();
416        let repo = segments.next()?.to_string();
417        return Some(InferredAboutIdentity {
418            package_type: PackageType::Github,
419            namespace: Some(owner),
420            name: Some(repo),
421            version: None,
422        });
423    }
424
425    None
426}
427
428fn build_about_purl(
429    package_type: PackageType,
430    namespace: Option<&str>,
431    name: Option<&str>,
432    version: Option<&str>,
433) -> Option<String> {
434    if package_type == PackageType::About {
435        return None;
436    }
437
438    let name = name?;
439    let mut purl = PackageUrl::new(package_type.as_str(), name).ok()?;
440    if let Some(namespace) = namespace {
441        purl.with_namespace(namespace).ok()?;
442    }
443    if let Some(version) = version {
444        purl.with_version(version).ok()?;
445    }
446    Some(purl.to_string())
447}
448
449fn build_extra_data(
450    yaml: &yaml_serde::Mapping,
451) -> Option<std::collections::HashMap<String, serde_json::Value>> {
452    let mut extra_data = std::collections::HashMap::new();
453    for key in ["license_file", "notice_file", "notes"] {
454        if let Some(value) = yaml.get(Value::String(key.to_string()))
455            && let Some(value) = yaml_value_to_string(value)
456        {
457            extra_data.insert(key.to_string(), serde_json::Value::String(value));
458        }
459    }
460    (!extra_data.is_empty()).then_some(extra_data)
461}
462
463crate::register_parser!(
464    "AboutCode .ABOUT metadata file",
465    &["**/*.ABOUT"],
466    "about",
467    "",
468    Some("https://aboutcode-toolkit.readthedocs.io/en/latest/specification.html"),
469);