Skip to main content

provenant/parsers/
about.rs

// SPDX-FileCopyrightText: Provenant contributors
// SPDX-License-Identifier: Apache-2.0

//! Parser for AboutCode .ABOUT metadata files.
//!
//! Extracts package metadata from AboutCode .ABOUT YAML files, which describe
//! software components, their licenses, and related information.
//!
//! # Supported Formats
//! - .ABOUT (case-sensitive uppercase extension)
//!
//! # Key Features
//! - YAML-based metadata parsing, with a fallback for flat `key: value` files
//!   that are not strictly valid YAML
//! - Package URL (purl) parsing for type/namespace/name/version extraction
//!   (read from the `purl` field, or `package_url` when `purl` is absent)
//! - Owner party information
//! - File reference tracking (`about_resource`, `license_file`, and
//!   `notice_file` fields)
//! - License expression extraction
//! - Flexible field mapping (`home_url`/`homepage_url`)
//!
//! # Implementation Notes
//! - Uses yaml_serde for YAML parsing
//! - Uses the packageurl crate for purl parsing
//! - Extension is case-sensitive and must be uppercase (.ABOUT, not .about)
//! - The package type comes from the `type` field, then from the `purl` field,
//!   and otherwise may be inferred from a PyPI or GitHub `download_url`
//! - Graceful error handling: logs a warning and returns default package data
//!   when the file cannot be read or parsed
26
27use crate::models::{DatasourceId, FileReference, PackageData, PackageType, Party};
28use crate::parser_warn as warn;
29use crate::parsers::utils::{read_file_to_string, truncate_field};
30use packageurl::PackageUrl;
31use std::path::Path;
32use std::str::FromStr;
33use url::Url;
34use yaml_serde::Value;
35
36use super::PackageParser;
37use super::license_normalization::{
38    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
39    normalize_spdx_expression,
40};
41
// Top-level keys looked up in the .ABOUT YAML mapping.

// Explicit package type.
const FIELD_TYPE: &str = "type";
// Package URL; `package_url` is consulted only when `purl` is not a string.
const FIELD_PURL: &str = "purl";
const FIELD_PACKAGE_URL: &str = "package_url";
const FIELD_NAMESPACE: &str = "namespace";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
// Homepage; `homepage_url` is consulted only when `home_url` is not a string.
const FIELD_HOME_URL: &str = "home_url";
const FIELD_HOMEPAGE_URL: &str = "homepage_url";
const FIELD_DOWNLOAD_URL: &str = "download_url";
const FIELD_COPYRIGHT: &str = "copyright";
const FIELD_LICENSE_EXPRESSION: &str = "license_expression";
const FIELD_OWNER: &str = "owner";
// Path recorded as a file reference of the package.
const FIELD_ABOUT_RESOURCE: &str = "about_resource";
55
/// AboutCode .ABOUT file parser.
///
/// Parses AboutCode metadata files that contain package information,
/// licensing, and file references in YAML format.
///
/// This is a stateless unit struct; all behavior lives in its
/// `PackageParser` implementation.
pub struct AboutFileParser;
61
/// Package identity guessed from a `download_url` by
/// `infer_about_from_download_url`.
#[derive(Clone)]
struct InferredAboutIdentity {
    /// Package type implied by the download host.
    package_type: PackageType,
    /// Namespace, when the URL provides one (the repository owner for GitHub URLs).
    namespace: Option<String>,
    /// Package name taken from the .ABOUT file or from the URL path.
    name: Option<String>,
    /// Package version, when one is known.
    version: Option<String>,
}
69
70impl PackageParser for AboutFileParser {
71    const PACKAGE_TYPE: PackageType = PackageType::About;
72
73    fn extract_packages(path: &Path) -> Vec<PackageData> {
74        let yaml = match read_and_parse_yaml(path) {
75            Ok(yaml) => yaml,
76            Err(e) => {
77                warn!("Failed to read or parse .ABOUT file at {:?}: {}", path, e);
78                return vec![default_package_data()];
79            }
80        };
81
82        // Extract type and purl information
83        let about_type = yaml
84            .get(FIELD_TYPE)
85            .and_then(|v| v.as_str())
86            .map(String::from);
87
88        let about_namespace = yaml
89            .get(FIELD_NAMESPACE)
90            .and_then(|v| v.as_str())
91            .map(|v| truncate_field(v.to_string()));
92
93        let purl_string = yaml
94            .get(FIELD_PURL)
95            .and_then(|v| v.as_str())
96            .or_else(|| yaml.get(FIELD_PACKAGE_URL).and_then(|v| v.as_str()))
97            .map(|v| truncate_field(v.to_string()));
98
99        // Parse purl if present
100        let (purl_type, purl_namespace, purl_name, purl_version) =
101            if let Some(ref purl_str) = purl_string {
102                match PackageUrl::from_str(purl_str) {
103                    Ok(purl) => (
104                        Some(truncate_field(purl.ty().to_string())),
105                        purl.namespace().map(|v| truncate_field(v.to_string())),
106                        Some(truncate_field(purl.name().to_string())),
107                        purl.version().map(|v| truncate_field(v.to_string())),
108                    ),
109                    Err(e) => {
110                        warn!("Failed to parse purl '{}': {}", purl_str, e);
111                        (None, None, None, None)
112                    }
113                }
114            } else {
115                (None, None, None, None)
116            };
117
118        let inferred = infer_about_from_download_url(
119            yaml.get(FIELD_DOWNLOAD_URL).and_then(|v| v.as_str()),
120            yaml.get(FIELD_NAME)
121                .and_then(yaml_value_to_string)
122                .as_deref(),
123            yaml.get(FIELD_VERSION)
124                .and_then(yaml_value_to_string)
125                .as_deref(),
126        );
127
128        let explicit_package_type = about_type
129            .clone()
130            .and_then(|s| s.parse::<crate::models::PackageType>().ok());
131        let parsed_purl_type = purl_type
132            .clone()
133            .and_then(|s| s.parse::<crate::models::PackageType>().ok());
134        let has_parsed_purl_identity = parsed_purl_type.is_some()
135            || purl_namespace.is_some()
136            || purl_name.is_some()
137            || purl_version.is_some();
138        let inferred_identity = if explicit_package_type.is_none() && !has_parsed_purl_identity {
139            inferred
140        } else {
141            None
142        };
143
144        let package_type = explicit_package_type
145            .or(parsed_purl_type)
146            .or_else(|| {
147                inferred_identity
148                    .as_ref()
149                    .map(|identity| identity.package_type)
150            })
151            .unwrap_or(Self::PACKAGE_TYPE);
152
153        // Priority: about_namespace > purl_namespace
154        let namespace = about_namespace
155            .clone()
156            .or(purl_namespace.clone())
157            .or_else(|| {
158                inferred_identity
159                    .as_ref()
160                    .and_then(|identity| identity.namespace.clone())
161            })
162            .map(truncate_field);
163
164        // Name and version from YAML or purl
165        let name = yaml
166            .get(FIELD_NAME)
167            .and_then(yaml_value_to_string)
168            .or(purl_name.clone())
169            .or_else(|| {
170                inferred_identity
171                    .as_ref()
172                    .and_then(|identity| identity.name.clone())
173            })
174            .map(truncate_field);
175
176        let version = yaml
177            .get(FIELD_VERSION)
178            .and_then(yaml_value_to_string)
179            .or(purl_version.clone())
180            .or_else(|| {
181                inferred_identity
182                    .as_ref()
183                    .and_then(|identity| identity.version.clone())
184            })
185            .map(truncate_field);
186
187        // Homepage URL (two possible field names)
188        let homepage_url = yaml
189            .get(FIELD_HOME_URL)
190            .and_then(|v| v.as_str())
191            .or_else(|| yaml.get(FIELD_HOMEPAGE_URL).and_then(|v| v.as_str()))
192            .map(|v| truncate_field(v.to_string()));
193
194        let download_url = yaml
195            .get(FIELD_DOWNLOAD_URL)
196            .and_then(|v| v.as_str())
197            .map(|v| truncate_field(v.to_string()));
198
199        let copyright = yaml
200            .get(FIELD_COPYRIGHT)
201            .and_then(|v| v.as_str())
202            .map(|v| truncate_field(v.to_string()));
203
204        let extracted_license_statement = yaml
205            .get(FIELD_LICENSE_EXPRESSION)
206            .and_then(|v| v.as_str())
207            .map(|v| truncate_field(v.to_string()));
208        let file_references = extract_file_references(&yaml);
209        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
210            extracted_license_statement
211                .as_deref()
212                .and_then(normalize_spdx_expression)
213                .map(|normalized| {
214                    build_declared_license_data(
215                        normalized,
216                        DeclaredLicenseMatchMetadata::single_line(
217                            extracted_license_statement.as_deref().unwrap_or_default(),
218                        ),
219                    )
220                })
221                .unwrap_or_else(|| {
222                    normalize_spdx_declared_license(extracted_license_statement.as_deref())
223                });
224
225        let vcs_url = yaml
226            .get(Value::String("vcs_url".to_string()))
227            .and_then(|v| v.as_str())
228            .map(|v| truncate_field(v.to_string()));
229
230        let extra_data = build_extra_data(&yaml);
231
232        let purl = purl_string
233            .or_else(|| {
234                let name = yaml
235                    .get(FIELD_NAME)
236                    .and_then(yaml_value_to_string)
237                    .or(purl_name.clone())
238                    .or_else(|| {
239                        inferred_identity
240                            .as_ref()
241                            .and_then(|identity| identity.name.clone())
242                    });
243                let version = yaml
244                    .get(FIELD_VERSION)
245                    .and_then(yaml_value_to_string)
246                    .or(purl_version.clone())
247                    .or_else(|| {
248                        inferred_identity
249                            .as_ref()
250                            .and_then(|identity| identity.version.clone())
251                    });
252                let namespace = about_namespace.clone().or_else(|| {
253                    inferred_identity
254                        .as_ref()
255                        .and_then(|identity| identity.namespace.clone())
256                });
257                build_about_purl(
258                    package_type,
259                    namespace.as_deref(),
260                    name.as_deref(),
261                    version.as_deref(),
262                )
263            })
264            .map(truncate_field);
265
266        // Owner party
267        let parties = extract_owner_party(&yaml);
268
269        // File references
270        vec![PackageData {
271            package_type: Some(package_type),
272            namespace,
273            name,
274            version,
275            qualifiers: None,
276            subpath: None,
277            primary_language: None,
278            description: None,
279            release_date: None,
280            parties,
281            keywords: Vec::new(),
282            homepage_url,
283            download_url,
284            size: None,
285            sha1: None,
286            md5: None,
287            sha256: None,
288            sha512: None,
289            bug_tracking_url: None,
290            code_view_url: None,
291            vcs_url,
292            copyright,
293            holder: None,
294            declared_license_expression,
295            declared_license_expression_spdx,
296            license_detections,
297            other_license_expression: None,
298            other_license_expression_spdx: None,
299            other_license_detections: Vec::new(),
300            extracted_license_statement,
301            notice_text: None,
302            source_packages: Vec::new(),
303            file_references,
304            is_private: false,
305            is_virtual: false,
306            extra_data,
307            dependencies: Vec::new(),
308            repository_homepage_url: None,
309            repository_download_url: None,
310            api_data_url: None,
311            datasource_id: Some(DatasourceId::AboutFile),
312            purl,
313        }]
314    }
315
316    fn is_match(path: &Path) -> bool {
317        path.extension()
318            .and_then(|ext| ext.to_str())
319            .is_some_and(|ext| ext == "ABOUT")
320    }
321
322    fn metadata() -> Vec<super::metadata::ParserMetadata> {
323        vec![super::metadata::ParserMetadata {
324            description: "AboutCode .ABOUT metadata file",
325            file_patterns: &["**/*.ABOUT"],
326            package_type: "about",
327            primary_language: "",
328            documentation_url: Some(
329                "https://aboutcode-toolkit.readthedocs.io/en/latest/specification.html",
330            ),
331        }]
332    }
333}
334
335/// Reads and parses a YAML file.
336fn read_and_parse_yaml(path: &Path) -> Result<yaml_serde::Mapping, String> {
337    let content =
338        read_file_to_string(path, None).map_err(|e| format!("Failed to read file: {}", e))?;
339
340    parse_yaml_mapping(&content)
341        .or_else(|yaml_error| parse_shallow_scalar_mapping(&content).ok_or(yaml_error))
342}
343
344fn parse_yaml_mapping(content: &str) -> Result<yaml_serde::Mapping, String> {
345    let value: Value =
346        yaml_serde::from_str(content).map_err(|e| format!("Failed to parse YAML: {}", e))?;
347
348    match value {
349        Value::Mapping(map) => Ok(map),
350        _ => Err("Expected YAML mapping at root".to_string()),
351    }
352}
353
354fn parse_shallow_scalar_mapping(content: &str) -> Option<yaml_serde::Mapping> {
355    let mut map = yaml_serde::Mapping::new();
356    let mut saw_mapping_entry = false;
357
358    for line in content.lines() {
359        let trimmed = line.trim();
360        if trimmed.is_empty() || trimmed.starts_with('#') {
361            continue;
362        }
363        if line.starts_with(char::is_whitespace) {
364            return None;
365        }
366
367        let (raw_key, raw_value) = trimmed.split_once(':')?;
368        let key = raw_key.trim();
369        if key.is_empty()
370            || !key.chars().all(|character| {
371                character.is_ascii_alphanumeric() || matches!(character, '_' | '-')
372            })
373        {
374            return None;
375        }
376
377        let value = raw_value.trim();
378        if value.is_empty() {
379            return None;
380        }
381
382        saw_mapping_entry = true;
383        map.insert(
384            Value::String(key.to_string()),
385            Value::String(unquote_yaml_scalar(value)),
386        );
387    }
388
389    saw_mapping_entry.then_some(map)
390}
391
/// Removes one pair of matching surrounding quotes (`"…"` or `'…'`) from `value`.
///
/// The text is returned unchanged when it is not wrapped in a matching pair,
/// including when it consists of a single quote character.
fn unquote_yaml_scalar(value: &str) -> String {
    // Yields the inner text only when `value` both starts and ends with `quote`.
    let strip_pair = |quote: char| {
        value
            .strip_prefix(quote)
            .and_then(|rest| rest.strip_suffix(quote))
    };

    strip_pair('"')
        .or_else(|| strip_pair('\''))
        .unwrap_or(value)
        .to_string()
}
406
407/// Converts a YAML value to a string, handling strings, numbers, and booleans.
408fn yaml_value_to_string(value: &Value) -> Option<String> {
409    match value {
410        Value::String(s) => Some(s.clone()),
411        Value::Number(n) => Some(n.to_string()),
412        Value::Bool(b) => Some(b.to_string()),
413        _ => None,
414    }
415}
416
417/// Extracts owner party information from YAML.
418fn extract_owner_party(yaml: &yaml_serde::Mapping) -> Vec<Party> {
419    let owner = yaml
420        .get(Value::String(FIELD_OWNER.to_string()))
421        .map(|v| match v {
422            Value::String(s) => truncate_field(s.clone()),
423            _ => truncate_field(format!("{:?}", v)),
424        });
425
426    if let Some(owner_name) = owner {
427        if !owner_name.is_empty() {
428            vec![Party {
429                r#type: Some("person".to_string()),
430                role: Some("owner".to_string()),
431                name: Some(owner_name),
432                email: None,
433                url: None,
434                organization: None,
435                organization_url: None,
436                timezone: None,
437            }]
438        } else {
439            Vec::new()
440        }
441    } else {
442        Vec::new()
443    }
444}
445
446/// Extracts file references from YAML.
447fn extract_file_references(yaml: &yaml_serde::Mapping) -> Vec<FileReference> {
448    let about_resource = yaml
449        .get(Value::String(FIELD_ABOUT_RESOURCE.to_string()))
450        .and_then(|v| v.as_str());
451    let license_file = yaml
452        .get(Value::String("license_file".to_string()))
453        .and_then(|v| v.as_str());
454    let notice_file = yaml
455        .get(Value::String("notice_file".to_string()))
456        .and_then(|v| v.as_str());
457
458    let mut refs = Vec::new();
459
460    if let Some(path) = about_resource {
461        refs.push(FileReference {
462            path: truncate_field(path.to_string()),
463            size: None,
464            sha1: None,
465            md5: None,
466            sha256: None,
467            sha512: None,
468            extra_data: None,
469        });
470    }
471
472    for path in [license_file, notice_file].into_iter().flatten() {
473        refs.push(FileReference {
474            path: truncate_field(path.to_string()),
475            size: None,
476            sha1: None,
477            md5: None,
478            sha256: None,
479            sha512: None,
480            extra_data: None,
481        });
482    }
483
484    refs
485}
486
487/// Returns a default (empty) PackageData structure.
488fn default_package_data() -> PackageData {
489    PackageData {
490        package_type: Some(PackageType::About),
491        datasource_id: Some(DatasourceId::AboutFile),
492        ..Default::default()
493    }
494}
495
496fn infer_about_from_download_url(
497    download_url: Option<&str>,
498    about_name: Option<&str>,
499    about_version: Option<&str>,
500) -> Option<InferredAboutIdentity> {
501    let url = Url::parse(download_url?).ok()?;
502    let host = url.host_str()?;
503
504    if matches!(host, "pypi.python.org" | "files.pythonhosted.org") {
505        let name = about_name.map(str::to_string)?;
506        let version = about_version.map(str::to_string);
507        return Some(InferredAboutIdentity {
508            package_type: PackageType::Pypi,
509            namespace: None,
510            name: Some(name),
511            version,
512        });
513    }
514
515    if matches!(host, "raw.githubusercontent.com" | "github.com") {
516        let mut segments = url.path_segments()?;
517        let owner = segments.next()?.to_string();
518        let repo = segments.next()?.to_string();
519        return Some(InferredAboutIdentity {
520            package_type: PackageType::Github,
521            namespace: Some(owner),
522            name: Some(repo),
523            version: None,
524        });
525    }
526
527    None
528}
529
530fn build_about_purl(
531    package_type: PackageType,
532    namespace: Option<&str>,
533    name: Option<&str>,
534    version: Option<&str>,
535) -> Option<String> {
536    if package_type == PackageType::About {
537        return None;
538    }
539
540    let name = name?;
541    let mut purl = PackageUrl::new(package_type.as_str(), name).ok()?;
542    if let Some(namespace) = namespace {
543        purl.with_namespace(namespace).ok()?;
544    }
545    if let Some(version) = version {
546        purl.with_version(version).ok()?;
547    }
548    Some(purl.to_string())
549}
550
551fn build_extra_data(
552    yaml: &yaml_serde::Mapping,
553) -> Option<std::collections::HashMap<String, serde_json::Value>> {
554    let mut extra_data = std::collections::HashMap::new();
555    for key in ["license_file", "notice_file", "notes"] {
556        if let Some(value) = yaml.get(Value::String(key.to_string()))
557            && let Some(value) = yaml_value_to_string(value)
558        {
559            extra_data.insert(
560                key.to_string(),
561                serde_json::Value::String(truncate_field(value)),
562            );
563        }
564    }
565    (!extra_data.is_empty()).then_some(extra_data)
566}