// Source file: provenant/parsers/about.rs

//! Parser for AboutCode .ABOUT metadata files.
//!
//! Extracts package metadata from AboutCode .ABOUT YAML files, which describe
//! software components, licenses, and related information.
//!
//! # Supported Formats
//! - .ABOUT (case-sensitive uppercase extension)
//!
//! # Key Features
//! - YAML-based metadata parsing
//! - Package URL (purl) parsing for type/namespace extraction
//! - Owner party information
//! - File reference tracking (about_resource, license_file, notice_file fields)
//! - License expression extraction
//! - Flexible field mapping (home_url/homepage_url, purl/package_url)
//!
//! # Implementation Notes
//! - Uses serde_yaml for YAML parsing
//! - Uses packageurl crate for purl parsing
//! - Extension is case-sensitive and must be uppercase (.ABOUT not .about)
//! - Type can be overridden by 'type' field or extracted from 'purl' field;
//!   failing both, it is inferred from a PyPI or GitHub 'download_url'
//! - Graceful error handling: logs warnings and returns default on parse failure

use crate::models::{DatasourceId, FileReference, PackageData, PackageType, Party};
use log::warn;
use packageurl::PackageUrl;
use serde_yaml::Value;
use std::collections::HashMap;
use std::fs;
use std::path::Path;
use std::str::FromStr;
use url::Url;

use super::PackageParser;
use super::license_normalization::normalize_spdx_declared_license;

// Top-level YAML keys read from an .ABOUT document.

// Explicit package type; takes precedence over the type carried by the purl.
const FIELD_TYPE: &str = "type";
// Package URL string; `package_url` is consulted only when `purl` is absent.
const FIELD_PURL: &str = "purl";
const FIELD_PACKAGE_URL: &str = "package_url";
// Explicit identity fields; each takes precedence over the purl-derived value.
const FIELD_NAMESPACE: &str = "namespace";
const FIELD_NAME: &str = "name";
const FIELD_VERSION: &str = "version";
// Homepage; `home_url` is tried first, then `homepage_url`.
const FIELD_HOME_URL: &str = "home_url";
const FIELD_HOMEPAGE_URL: &str = "homepage_url";
// Download location; also used to infer a PyPI/GitHub identity.
const FIELD_DOWNLOAD_URL: &str = "download_url";
const FIELD_COPYRIGHT: &str = "copyright";
// Raw license expression, kept verbatim and also normalized.
const FIELD_LICENSE_EXPRESSION: &str = "license_expression";
// Owner of the component, reported as a party with role "owner".
const FIELD_OWNER: &str = "owner";
// Path of the resource this .ABOUT file documents; emitted as a file reference.
const FIELD_ABOUT_RESOURCE: &str = "about_resource";

/// AboutCode .ABOUT file parser.
///
/// Parses AboutCode metadata files that contain package information,
/// licensing, and file references in YAML format.
///
/// This is a stateless marker type: all behavior lives in its
/// `PackageParser` implementation.
pub struct AboutFileParser;

/// Package identity guessed from a `download_url` by
/// `infer_about_from_download_url`.
///
/// It is consulted only as a last-resort fallback, after the explicit YAML
/// fields and the purl have been tried.
#[derive(Clone)]
struct InferredAboutIdentity {
    /// Package type implied by the download host.
    package_type: PackageType,
    /// Namespace implied by the URL (the first path segment for GitHub hosts).
    namespace: Option<String>,
    /// Package name: the .ABOUT `name` for PyPI hosts, the second path segment for GitHub hosts.
    name: Option<String>,
    /// Package version, when one is known.
    version: Option<String>,
}

64impl PackageParser for AboutFileParser {
65    const PACKAGE_TYPE: PackageType = PackageType::About;
66
67    fn extract_packages(path: &Path) -> Vec<PackageData> {
68        let yaml = match read_and_parse_yaml(path) {
69            Ok(yaml) => yaml,
70            Err(e) => {
71                warn!("Failed to read or parse .ABOUT file at {:?}: {}", path, e);
72                return vec![default_package_data()];
73            }
74        };
75
76        // Extract type and purl information
77        let about_type = yaml
78            .get(FIELD_TYPE)
79            .and_then(|v| v.as_str())
80            .map(String::from);
81
82        let about_namespace = yaml
83            .get(FIELD_NAMESPACE)
84            .and_then(|v| v.as_str())
85            .map(String::from);
86
87        let purl_string = yaml
88            .get(FIELD_PURL)
89            .and_then(|v| v.as_str())
90            .or_else(|| yaml.get(FIELD_PACKAGE_URL).and_then(|v| v.as_str()))
91            .map(String::from);
92
93        // Parse purl if present
94        let (purl_type, purl_namespace, purl_name, purl_version) =
95            if let Some(ref purl_str) = purl_string {
96                match PackageUrl::from_str(purl_str) {
97                    Ok(purl) => (
98                        Some(purl.ty().to_string()),
99                        purl.namespace().map(String::from),
100                        Some(purl.name().to_string()),
101                        purl.version().map(String::from),
102                    ),
103                    Err(e) => {
104                        warn!("Failed to parse purl '{}': {}", purl_str, e);
105                        (None, None, None, None)
106                    }
107                }
108            } else {
109                (None, None, None, None)
110            };
111
112        let inferred = infer_about_from_download_url(
113            yaml.get(FIELD_DOWNLOAD_URL).and_then(|v| v.as_str()),
114            yaml.get(FIELD_NAME)
115                .and_then(yaml_value_to_string)
116                .as_deref(),
117            yaml.get(FIELD_VERSION)
118                .and_then(yaml_value_to_string)
119                .as_deref(),
120        );
121
122        let package_type = about_type
123            .clone()
124            .or(purl_type)
125            .and_then(|s| s.parse::<crate::models::PackageType>().ok())
126            .or_else(|| inferred.as_ref().map(|identity| identity.package_type))
127            .unwrap_or(Self::PACKAGE_TYPE);
128
129        // Priority: about_namespace > purl_namespace
130        let namespace = about_namespace
131            .clone()
132            .or(purl_namespace.clone())
133            .or_else(|| {
134                inferred
135                    .as_ref()
136                    .and_then(|identity| identity.namespace.clone())
137            });
138
139        // Name and version from YAML or purl
140        let name = yaml
141            .get(FIELD_NAME)
142            .and_then(yaml_value_to_string)
143            .or(purl_name.clone())
144            .or_else(|| inferred.as_ref().and_then(|identity| identity.name.clone()));
145
146        let version = yaml
147            .get(FIELD_VERSION)
148            .and_then(yaml_value_to_string)
149            .or(purl_version.clone())
150            .or_else(|| {
151                inferred
152                    .as_ref()
153                    .and_then(|identity| identity.version.clone())
154            });
155
156        // Homepage URL (two possible field names)
157        let homepage_url = yaml
158            .get(FIELD_HOME_URL)
159            .and_then(|v| v.as_str())
160            .or_else(|| yaml.get(FIELD_HOMEPAGE_URL).and_then(|v| v.as_str()))
161            .map(String::from);
162
163        let download_url = yaml
164            .get(FIELD_DOWNLOAD_URL)
165            .and_then(|v| v.as_str())
166            .map(String::from);
167
168        let copyright = yaml
169            .get(FIELD_COPYRIGHT)
170            .and_then(|v| v.as_str())
171            .map(String::from);
172
173        let extracted_license_statement = yaml
174            .get(FIELD_LICENSE_EXPRESSION)
175            .and_then(|v| v.as_str())
176            .map(String::from);
177        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
178            normalize_spdx_declared_license(extracted_license_statement.as_deref());
179
180        let vcs_url = yaml
181            .get(Value::String("vcs_url".to_string()))
182            .and_then(|v| v.as_str())
183            .map(String::from);
184
185        let extra_data = build_extra_data(&yaml);
186
187        let purl = purl_string.or_else(|| {
188            let name = yaml
189                .get(FIELD_NAME)
190                .and_then(yaml_value_to_string)
191                .or(purl_name.clone())
192                .or_else(|| inferred.as_ref().and_then(|identity| identity.name.clone()));
193            let version = yaml
194                .get(FIELD_VERSION)
195                .and_then(yaml_value_to_string)
196                .or(purl_version.clone())
197                .or_else(|| {
198                    inferred
199                        .as_ref()
200                        .and_then(|identity| identity.version.clone())
201                });
202            let namespace = about_namespace.clone().or_else(|| {
203                inferred
204                    .as_ref()
205                    .and_then(|identity| identity.namespace.clone())
206            });
207            build_about_purl(
208                package_type,
209                namespace.as_deref(),
210                name.as_deref(),
211                version.as_deref(),
212            )
213        });
214
215        // Owner party
216        let parties = extract_owner_party(&yaml);
217
218        // File references
219        let file_references = extract_file_references(&yaml);
220
221        vec![PackageData {
222            package_type: Some(package_type),
223            namespace,
224            name,
225            version,
226            qualifiers: None,
227            subpath: None,
228            primary_language: None,
229            description: None,
230            release_date: None,
231            parties,
232            keywords: Vec::new(),
233            homepage_url,
234            download_url,
235            size: None,
236            sha1: None,
237            md5: None,
238            sha256: None,
239            sha512: None,
240            bug_tracking_url: None,
241            code_view_url: None,
242            vcs_url,
243            copyright,
244            holder: None,
245            declared_license_expression,
246            declared_license_expression_spdx,
247            license_detections,
248            other_license_expression: None,
249            other_license_expression_spdx: None,
250            other_license_detections: Vec::new(),
251            extracted_license_statement,
252            notice_text: None,
253            source_packages: Vec::new(),
254            file_references,
255            is_private: false,
256            is_virtual: false,
257            extra_data,
258            dependencies: Vec::new(),
259            repository_homepage_url: None,
260            repository_download_url: None,
261            api_data_url: None,
262            datasource_id: Some(DatasourceId::AboutFile),
263            purl,
264        }]
265    }
266
267    fn is_match(path: &Path) -> bool {
268        path.extension()
269            .and_then(|ext| ext.to_str())
270            .is_some_and(|ext| ext == "ABOUT")
271    }
272}
273
274/// Reads and parses a YAML file.
275fn read_and_parse_yaml(path: &Path) -> Result<serde_yaml::Mapping, String> {
276    let content = fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
277
278    let value: Value =
279        serde_yaml::from_str(&content).map_err(|e| format!("Failed to parse YAML: {}", e))?;
280
281    match value {
282        Value::Mapping(map) => Ok(map),
283        _ => Err("Expected YAML mapping at root".to_string()),
284    }
285}
286
287/// Converts a YAML value to a string, handling strings, numbers, and booleans.
288fn yaml_value_to_string(value: &Value) -> Option<String> {
289    match value {
290        Value::String(s) => Some(s.clone()),
291        Value::Number(n) => Some(n.to_string()),
292        Value::Bool(b) => Some(b.to_string()),
293        _ => None,
294    }
295}
296
297/// Extracts owner party information from YAML.
298fn extract_owner_party(yaml: &serde_yaml::Mapping) -> Vec<Party> {
299    let owner = yaml
300        .get(Value::String(FIELD_OWNER.to_string()))
301        .map(|v| match v {
302            Value::String(s) => s.clone(),
303            _ => {
304                // Convert non-string values to their debug representation
305                format!("{:?}", v)
306            }
307        });
308
309    if let Some(owner_name) = owner {
310        if !owner_name.is_empty() {
311            vec![Party {
312                r#type: Some("person".to_string()),
313                role: Some("owner".to_string()),
314                name: Some(owner_name),
315                email: None,
316                url: None,
317                organization: None,
318                organization_url: None,
319                timezone: None,
320            }]
321        } else {
322            Vec::new()
323        }
324    } else {
325        Vec::new()
326    }
327}
328
329/// Extracts file references from YAML.
330fn extract_file_references(yaml: &serde_yaml::Mapping) -> Vec<FileReference> {
331    let about_resource = yaml
332        .get(Value::String(FIELD_ABOUT_RESOURCE.to_string()))
333        .and_then(|v| v.as_str());
334    let license_file = yaml
335        .get(Value::String("license_file".to_string()))
336        .and_then(|v| v.as_str());
337    let notice_file = yaml
338        .get(Value::String("notice_file".to_string()))
339        .and_then(|v| v.as_str());
340
341    let mut refs = Vec::new();
342
343    if let Some(path) = about_resource {
344        refs.push(FileReference {
345            path: path.to_string(),
346            size: None,
347            sha1: None,
348            md5: None,
349            sha256: None,
350            sha512: None,
351            extra_data: None,
352        });
353    }
354
355    for path in [license_file, notice_file].into_iter().flatten() {
356        refs.push(FileReference {
357            path: path.to_string(),
358            size: None,
359            sha1: None,
360            md5: None,
361            sha256: None,
362            sha512: None,
363            extra_data: None,
364        });
365    }
366
367    refs
368}
369
370/// Returns a default (empty) PackageData structure.
371fn default_package_data() -> PackageData {
372    PackageData {
373        package_type: Some(PackageType::About),
374        datasource_id: Some(DatasourceId::AboutFile),
375        ..Default::default()
376    }
377}
378
379fn infer_about_from_download_url(
380    download_url: Option<&str>,
381    about_name: Option<&str>,
382    about_version: Option<&str>,
383) -> Option<InferredAboutIdentity> {
384    let url = Url::parse(download_url?).ok()?;
385    let host = url.host_str()?;
386
387    if matches!(host, "pypi.python.org" | "files.pythonhosted.org") {
388        let name = about_name.map(str::to_string)?;
389        let version = about_version.map(str::to_string);
390        return Some(InferredAboutIdentity {
391            package_type: PackageType::Pypi,
392            namespace: None,
393            name: Some(name),
394            version,
395        });
396    }
397
398    if matches!(host, "raw.githubusercontent.com" | "github.com") {
399        let mut segments = url.path_segments()?;
400        let owner = segments.next()?.to_string();
401        let repo = segments.next()?.to_string();
402        return Some(InferredAboutIdentity {
403            package_type: PackageType::Github,
404            namespace: Some(owner),
405            name: Some(repo),
406            version: None,
407        });
408    }
409
410    None
411}
412
413fn build_about_purl(
414    package_type: PackageType,
415    namespace: Option<&str>,
416    name: Option<&str>,
417    version: Option<&str>,
418) -> Option<String> {
419    if package_type == PackageType::About {
420        return None;
421    }
422
423    let name = name?;
424    let mut purl = PackageUrl::new(package_type.as_str(), name).ok()?;
425    if let Some(namespace) = namespace {
426        purl.with_namespace(namespace).ok()?;
427    }
428    if let Some(version) = version {
429        purl.with_version(version).ok()?;
430    }
431    Some(purl.to_string())
432}
433
434fn build_extra_data(
435    yaml: &serde_yaml::Mapping,
436) -> Option<std::collections::HashMap<String, serde_json::Value>> {
437    let mut extra_data = std::collections::HashMap::new();
438    for key in ["license_file", "notice_file", "notes"] {
439        if let Some(value) = yaml.get(Value::String(key.to_string()))
440            && let Some(value) = yaml_value_to_string(value)
441        {
442            extra_data.insert(key.to_string(), serde_json::Value::String(value));
443        }
444    }
445    (!extra_data.is_empty()).then_some(extra_data)
446}
447
// Registers this parser's metadata with the crate-wide parser registry.
// NOTE(review): the arguments appear to be, in order, a human-readable
// description, the path glob patterns, the package type name, the primary
// language (empty here), and a documentation URL — confirm against the
// `register_parser!` macro definition.
crate::register_parser!(
    "AboutCode .ABOUT metadata file",
    &["**/*.ABOUT"],
    "about",
    "",
    Some("https://aboutcode-toolkit.readthedocs.io/en/latest/specification.html"),
);