Skip to main content

provenant/parsers/
about.rs

//! Parser for AboutCode .ABOUT metadata files.
//!
//! Extracts package metadata from AboutCode .ABOUT YAML files, which describe
//! software components, licenses, and related information.
//!
//! # Supported Formats
//! - .ABOUT (case-sensitive uppercase extension)
//!
//! # Key Features
//! - YAML-based metadata parsing
//! - Package URL (purl) parsing for type/namespace extraction
//! - Owner party information
//! - File reference tracking (about_resource field)
//! - License expression extraction
//! - Flexible field mapping (home_url/homepage_url)
//!
//! # Implementation Notes
//! - Uses yaml_serde for YAML parsing
//! - Uses packageurl crate for purl parsing
//! - Extension is case-sensitive and must be uppercase (.ABOUT not .about)
//! - Type can be overridden by 'type' field or extracted from 'purl' field
//! - Graceful error handling: logs warnings and returns default on parse failure
23
24use crate::models::{DatasourceId, FileReference, PackageData, PackageType, Party};
25use crate::parser_warn as warn;
26use crate::parsers::utils::{read_file_to_string, truncate_field};
27use packageurl::PackageUrl;
28use std::path::Path;
29use std::str::FromStr;
30use url::Url;
31use yaml_serde::Value;
32
33use super::PackageParser;
34use super::license_normalization::{
35    DeclaredLicenseMatchMetadata, build_declared_license_data, normalize_spdx_declared_license,
36    normalize_spdx_expression,
37};
38
// Top-level keys read from the .ABOUT YAML mapping.

// Package identity. `purl` is consulted first and `package_url` is the
// fallback spelling of the same field.
const FIELD_TYPE: &str = "type";
const FIELD_PURL: &str = "purl";
const FIELD_PACKAGE_URL: &str = "package_url";
const FIELD_NAMESPACE: &str = "namespace";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
// Homepage: `home_url` is consulted first, `homepage_url` second.
const FIELD_HOME_URL: &str = "home_url";
const FIELD_HOMEPAGE_URL: &str = "homepage_url";
const FIELD_DOWNLOAD_URL: &str = "download_url";
// Legal and ownership metadata.
const FIELD_COPYRIGHT: &str = "copyright";
const FIELD_LICENSE_EXPRESSION: &str = "license_expression";
const FIELD_OWNER: &str = "owner";
// Path of the resource that the .ABOUT file documents.
const FIELD_ABOUT_RESOURCE: &str = "about_resource";
52
/// AboutCode .ABOUT file parser.
///
/// Parses AboutCode metadata files that contain package information,
/// licensing, and file references in YAML format.
///
/// This is a stateless unit type; all behavior lives in its
/// `PackageParser` implementation.
pub struct AboutFileParser;
58
/// Package identity guessed from a `download_url` host by
/// `infer_about_from_download_url`.
///
/// It is only used as a fallback when the .ABOUT file itself (or its purl)
/// does not provide the corresponding value.
#[derive(Clone)]
struct InferredAboutIdentity {
    /// Package type implied by the download host.
    package_type: PackageType,
    /// Namespace implied by the URL, when there is one.
    namespace: Option<String>,
    /// Package name implied by the URL or carried over from the .ABOUT file.
    name: Option<String>,
    /// Package version, when one could be determined.
    version: Option<String>,
}
66
67impl PackageParser for AboutFileParser {
68    const PACKAGE_TYPE: PackageType = PackageType::About;
69
70    fn extract_packages(path: &Path) -> Vec<PackageData> {
71        let yaml = match read_and_parse_yaml(path) {
72            Ok(yaml) => yaml,
73            Err(e) => {
74                warn!("Failed to read or parse .ABOUT file at {:?}: {}", path, e);
75                return vec![default_package_data()];
76            }
77        };
78
79        // Extract type and purl information
80        let about_type = yaml
81            .get(FIELD_TYPE)
82            .and_then(|v| v.as_str())
83            .map(String::from);
84
85        let about_namespace = yaml
86            .get(FIELD_NAMESPACE)
87            .and_then(|v| v.as_str())
88            .map(|v| truncate_field(v.to_string()));
89
90        let purl_string = yaml
91            .get(FIELD_PURL)
92            .and_then(|v| v.as_str())
93            .or_else(|| yaml.get(FIELD_PACKAGE_URL).and_then(|v| v.as_str()))
94            .map(|v| truncate_field(v.to_string()));
95
96        // Parse purl if present
97        let (purl_type, purl_namespace, purl_name, purl_version) =
98            if let Some(ref purl_str) = purl_string {
99                match PackageUrl::from_str(purl_str) {
100                    Ok(purl) => (
101                        Some(truncate_field(purl.ty().to_string())),
102                        purl.namespace().map(|v| truncate_field(v.to_string())),
103                        Some(truncate_field(purl.name().to_string())),
104                        purl.version().map(|v| truncate_field(v.to_string())),
105                    ),
106                    Err(e) => {
107                        warn!("Failed to parse purl '{}': {}", purl_str, e);
108                        (None, None, None, None)
109                    }
110                }
111            } else {
112                (None, None, None, None)
113            };
114
115        let inferred = infer_about_from_download_url(
116            yaml.get(FIELD_DOWNLOAD_URL).and_then(|v| v.as_str()),
117            yaml.get(FIELD_NAME)
118                .and_then(yaml_value_to_string)
119                .as_deref(),
120            yaml.get(FIELD_VERSION)
121                .and_then(yaml_value_to_string)
122                .as_deref(),
123        );
124
125        let package_type = about_type
126            .clone()
127            .or(purl_type)
128            .and_then(|s| s.parse::<crate::models::PackageType>().ok())
129            .or_else(|| inferred.as_ref().map(|identity| identity.package_type))
130            .unwrap_or(Self::PACKAGE_TYPE);
131
132        // Priority: about_namespace > purl_namespace
133        let namespace = about_namespace
134            .clone()
135            .or(purl_namespace.clone())
136            .or_else(|| {
137                inferred
138                    .as_ref()
139                    .and_then(|identity| identity.namespace.clone())
140            })
141            .map(truncate_field);
142
143        // Name and version from YAML or purl
144        let name = yaml
145            .get(FIELD_NAME)
146            .and_then(yaml_value_to_string)
147            .or(purl_name.clone())
148            .or_else(|| inferred.as_ref().and_then(|identity| identity.name.clone()))
149            .map(truncate_field);
150
151        let version = yaml
152            .get(FIELD_VERSION)
153            .and_then(yaml_value_to_string)
154            .or(purl_version.clone())
155            .or_else(|| {
156                inferred
157                    .as_ref()
158                    .and_then(|identity| identity.version.clone())
159            })
160            .map(truncate_field);
161
162        // Homepage URL (two possible field names)
163        let homepage_url = yaml
164            .get(FIELD_HOME_URL)
165            .and_then(|v| v.as_str())
166            .or_else(|| yaml.get(FIELD_HOMEPAGE_URL).and_then(|v| v.as_str()))
167            .map(|v| truncate_field(v.to_string()));
168
169        let download_url = yaml
170            .get(FIELD_DOWNLOAD_URL)
171            .and_then(|v| v.as_str())
172            .map(|v| truncate_field(v.to_string()));
173
174        let copyright = yaml
175            .get(FIELD_COPYRIGHT)
176            .and_then(|v| v.as_str())
177            .map(|v| truncate_field(v.to_string()));
178
179        let extracted_license_statement = yaml
180            .get(FIELD_LICENSE_EXPRESSION)
181            .and_then(|v| v.as_str())
182            .map(|v| truncate_field(v.to_string()));
183        let file_references = extract_file_references(&yaml);
184        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
185            extracted_license_statement
186                .as_deref()
187                .and_then(normalize_spdx_expression)
188                .map(|normalized| {
189                    build_declared_license_data(
190                        normalized,
191                        DeclaredLicenseMatchMetadata::single_line(
192                            extracted_license_statement.as_deref().unwrap_or_default(),
193                        ),
194                    )
195                })
196                .unwrap_or_else(|| {
197                    normalize_spdx_declared_license(extracted_license_statement.as_deref())
198                });
199
200        let vcs_url = yaml
201            .get(Value::String("vcs_url".to_string()))
202            .and_then(|v| v.as_str())
203            .map(|v| truncate_field(v.to_string()));
204
205        let extra_data = build_extra_data(&yaml);
206
207        let purl = purl_string
208            .or_else(|| {
209                let name = yaml
210                    .get(FIELD_NAME)
211                    .and_then(yaml_value_to_string)
212                    .or(purl_name.clone())
213                    .or_else(|| inferred.as_ref().and_then(|identity| identity.name.clone()));
214                let version = yaml
215                    .get(FIELD_VERSION)
216                    .and_then(yaml_value_to_string)
217                    .or(purl_version.clone())
218                    .or_else(|| {
219                        inferred
220                            .as_ref()
221                            .and_then(|identity| identity.version.clone())
222                    });
223                let namespace = about_namespace.clone().or_else(|| {
224                    inferred
225                        .as_ref()
226                        .and_then(|identity| identity.namespace.clone())
227                });
228                build_about_purl(
229                    package_type,
230                    namespace.as_deref(),
231                    name.as_deref(),
232                    version.as_deref(),
233                )
234            })
235            .map(truncate_field);
236
237        // Owner party
238        let parties = extract_owner_party(&yaml);
239
240        // File references
241        vec![PackageData {
242            package_type: Some(package_type),
243            namespace,
244            name,
245            version,
246            qualifiers: None,
247            subpath: None,
248            primary_language: None,
249            description: None,
250            release_date: None,
251            parties,
252            keywords: Vec::new(),
253            homepage_url,
254            download_url,
255            size: None,
256            sha1: None,
257            md5: None,
258            sha256: None,
259            sha512: None,
260            bug_tracking_url: None,
261            code_view_url: None,
262            vcs_url,
263            copyright,
264            holder: None,
265            declared_license_expression,
266            declared_license_expression_spdx,
267            license_detections,
268            other_license_expression: None,
269            other_license_expression_spdx: None,
270            other_license_detections: Vec::new(),
271            extracted_license_statement,
272            notice_text: None,
273            source_packages: Vec::new(),
274            file_references,
275            is_private: false,
276            is_virtual: false,
277            extra_data,
278            dependencies: Vec::new(),
279            repository_homepage_url: None,
280            repository_download_url: None,
281            api_data_url: None,
282            datasource_id: Some(DatasourceId::AboutFile),
283            purl,
284        }]
285    }
286
287    fn is_match(path: &Path) -> bool {
288        path.extension()
289            .and_then(|ext| ext.to_str())
290            .is_some_and(|ext| ext == "ABOUT")
291    }
292}
293
294/// Reads and parses a YAML file.
295fn read_and_parse_yaml(path: &Path) -> Result<yaml_serde::Mapping, String> {
296    let content =
297        read_file_to_string(path, None).map_err(|e| format!("Failed to read file: {}", e))?;
298
299    let value: Value =
300        yaml_serde::from_str(&content).map_err(|e| format!("Failed to parse YAML: {}", e))?;
301
302    match value {
303        Value::Mapping(map) => Ok(map),
304        _ => Err("Expected YAML mapping at root".to_string()),
305    }
306}
307
308/// Converts a YAML value to a string, handling strings, numbers, and booleans.
309fn yaml_value_to_string(value: &Value) -> Option<String> {
310    match value {
311        Value::String(s) => Some(s.clone()),
312        Value::Number(n) => Some(n.to_string()),
313        Value::Bool(b) => Some(b.to_string()),
314        _ => None,
315    }
316}
317
318/// Extracts owner party information from YAML.
319fn extract_owner_party(yaml: &yaml_serde::Mapping) -> Vec<Party> {
320    let owner = yaml
321        .get(Value::String(FIELD_OWNER.to_string()))
322        .map(|v| match v {
323            Value::String(s) => truncate_field(s.clone()),
324            _ => truncate_field(format!("{:?}", v)),
325        });
326
327    if let Some(owner_name) = owner {
328        if !owner_name.is_empty() {
329            vec![Party {
330                r#type: Some("person".to_string()),
331                role: Some("owner".to_string()),
332                name: Some(owner_name),
333                email: None,
334                url: None,
335                organization: None,
336                organization_url: None,
337                timezone: None,
338            }]
339        } else {
340            Vec::new()
341        }
342    } else {
343        Vec::new()
344    }
345}
346
347/// Extracts file references from YAML.
348fn extract_file_references(yaml: &yaml_serde::Mapping) -> Vec<FileReference> {
349    let about_resource = yaml
350        .get(Value::String(FIELD_ABOUT_RESOURCE.to_string()))
351        .and_then(|v| v.as_str());
352    let license_file = yaml
353        .get(Value::String("license_file".to_string()))
354        .and_then(|v| v.as_str());
355    let notice_file = yaml
356        .get(Value::String("notice_file".to_string()))
357        .and_then(|v| v.as_str());
358
359    let mut refs = Vec::new();
360
361    if let Some(path) = about_resource {
362        refs.push(FileReference {
363            path: truncate_field(path.to_string()),
364            size: None,
365            sha1: None,
366            md5: None,
367            sha256: None,
368            sha512: None,
369            extra_data: None,
370        });
371    }
372
373    for path in [license_file, notice_file].into_iter().flatten() {
374        refs.push(FileReference {
375            path: truncate_field(path.to_string()),
376            size: None,
377            sha1: None,
378            md5: None,
379            sha256: None,
380            sha512: None,
381            extra_data: None,
382        });
383    }
384
385    refs
386}
387
388/// Returns a default (empty) PackageData structure.
389fn default_package_data() -> PackageData {
390    PackageData {
391        package_type: Some(PackageType::About),
392        datasource_id: Some(DatasourceId::AboutFile),
393        ..Default::default()
394    }
395}
396
397fn infer_about_from_download_url(
398    download_url: Option<&str>,
399    about_name: Option<&str>,
400    about_version: Option<&str>,
401) -> Option<InferredAboutIdentity> {
402    let url = Url::parse(download_url?).ok()?;
403    let host = url.host_str()?;
404
405    if matches!(host, "pypi.python.org" | "files.pythonhosted.org") {
406        let name = about_name.map(str::to_string)?;
407        let version = about_version.map(str::to_string);
408        return Some(InferredAboutIdentity {
409            package_type: PackageType::Pypi,
410            namespace: None,
411            name: Some(name),
412            version,
413        });
414    }
415
416    if matches!(host, "raw.githubusercontent.com" | "github.com") {
417        let mut segments = url.path_segments()?;
418        let owner = segments.next()?.to_string();
419        let repo = segments.next()?.to_string();
420        return Some(InferredAboutIdentity {
421            package_type: PackageType::Github,
422            namespace: Some(owner),
423            name: Some(repo),
424            version: None,
425        });
426    }
427
428    None
429}
430
431fn build_about_purl(
432    package_type: PackageType,
433    namespace: Option<&str>,
434    name: Option<&str>,
435    version: Option<&str>,
436) -> Option<String> {
437    if package_type == PackageType::About {
438        return None;
439    }
440
441    let name = name?;
442    let mut purl = PackageUrl::new(package_type.as_str(), name).ok()?;
443    if let Some(namespace) = namespace {
444        purl.with_namespace(namespace).ok()?;
445    }
446    if let Some(version) = version {
447        purl.with_version(version).ok()?;
448    }
449    Some(purl.to_string())
450}
451
452fn build_extra_data(
453    yaml: &yaml_serde::Mapping,
454) -> Option<std::collections::HashMap<String, serde_json::Value>> {
455    let mut extra_data = std::collections::HashMap::new();
456    for key in ["license_file", "notice_file", "notes"] {
457        if let Some(value) = yaml.get(Value::String(key.to_string()))
458            && let Some(value) = yaml_value_to_string(value)
459        {
460            extra_data.insert(
461                key.to_string(),
462                serde_json::Value::String(truncate_field(value)),
463            );
464        }
465    }
466    (!extra_data.is_empty()).then_some(extra_data)
467}
468
// Registers metadata for this parser with the crate-wide parser registry.
// NOTE(review): the positional arguments look like description, path globs,
// package type, primary language (empty here), and documentation URL;
// confirm against the `register_parser!` macro definition.
crate::register_parser!(
    "AboutCode .ABOUT metadata file",
    &["**/*.ABOUT"],
    "about",
    "",
    Some("https://aboutcode-toolkit.readthedocs.io/en/latest/specification.html"),
);