Skip to main content

provenant/parsers/
cpan.rs

1//! Parser for CPAN Perl package manifests.
2//!
3//! Extracts package metadata, dependencies, and author information from
4//! CPAN distribution files used by Perl modules.
5//!
6//! # Supported Formats
7//! - META.json (CPAN::Meta::Spec v2.0+)
8//! - META.yml (CPAN::Meta::Spec v1.4)
9//! - MANIFEST (file list)
10//!
11//! # Key Features
12//! - Full metadata extraction from META.json and META.yml (beyond Python stub handlers)
13//! - Dependency extraction for all CPAN dependency scopes (runtime, build, test, configure)
14//! - Author party information extraction
15//! - Repository URL extraction
16//! - File references from MANIFEST
17//!
18//! # Implementation Notes
19//! - Uses serde_json for JSON parsing
20//! - Uses serde_yaml for YAML parsing
21//! - Python reference has stub-only handlers with no parse() method
22//! - This is a BEYOND PARITY implementation - we extract complete metadata
23
24use std::fs;
25use std::path::Path;
26
27use crate::parser_warn as warn;
28use packageurl::PackageUrl;
29use serde_json::Value as JsonValue;
30use serde_yaml::Value as YamlValue;
31
32use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
33
34use super::PackageParser;
35use super::license_normalization::{
36    DeclaredLicenseMatchMetadata, NormalizedDeclaredLicense, build_declared_license_data,
37    combine_normalized_licenses, empty_declared_license_data, normalize_declared_license_key,
38    normalize_spdx_expression,
39};
40
41const FIELD_NAME: &str = "name";
42const FIELD_VERSION: &str = "version";
43const FIELD_ABSTRACT: &str = "abstract";
44const FIELD_DESCRIPTION: &str = "description";
45const FIELD_LICENSE: &str = "license";
46const FIELD_AUTHOR: &str = "author";
47const FIELD_RESOURCES: &str = "resources";
48const FIELD_PREREQS: &str = "prereqs";
49const FIELD_REQUIRES: &str = "requires";
50const FIELD_BUILD_REQUIRES: &str = "build_requires";
51const FIELD_TEST_REQUIRES: &str = "test_requires";
52const FIELD_CONFIGURE_REQUIRES: &str = "configure_requires";
53
54/// CPAN META.json parser for CPAN::Meta::Spec v2.0+ metadata.
55///
56/// Extracts complete metadata from META.json files including dependencies
57/// from all scopes (runtime, build, test, configure).
58pub struct CpanMetaJsonParser;
59
60impl PackageParser for CpanMetaJsonParser {
61    const PACKAGE_TYPE: PackageType = PackageType::Cpan;
62
63    fn is_match(path: &Path) -> bool {
64        path.file_name().is_some_and(|name| name == "META.json")
65    }
66
67    fn extract_packages(path: &Path) -> Vec<PackageData> {
68        let json = match read_and_parse_json(path) {
69            Ok(json) => json,
70            Err(e) => {
71                warn!("Failed to parse META.json at {:?}: {}", path, e);
72                return vec![default_package_data(DatasourceId::CpanMetaJson)];
73            }
74        };
75
76        let name = json
77            .get(FIELD_NAME)
78            .and_then(|v| v.as_str())
79            .map(String::from);
80
81        let version = extract_version_from_json(&json);
82
83        let description = json
84            .get(FIELD_ABSTRACT)
85            .and_then(|v| v.as_str())
86            .map(String::from);
87
88        let extracted_license_statement = extract_license_from_json(&json);
89        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
90            normalize_cpan_declared_license(
91                json.get(FIELD_LICENSE),
92                extracted_license_statement.as_deref(),
93            );
94        let parties = extract_parties_from_json(&json);
95        let dependencies = extract_dependencies_from_json(&json);
96        let (homepage_url, vcs_url, code_view_url, bug_tracking_url) =
97            extract_resources_from_json(&json);
98
99        vec![PackageData {
100            package_type: Some(Self::PACKAGE_TYPE),
101            name,
102            version,
103            description,
104            declared_license_expression,
105            declared_license_expression_spdx,
106            license_detections,
107            extracted_license_statement,
108            parties,
109            dependencies,
110            homepage_url,
111            vcs_url,
112            code_view_url,
113            bug_tracking_url,
114            primary_language: Some("Perl".to_string()),
115            datasource_id: Some(DatasourceId::CpanMetaJson),
116            ..Default::default()
117        }]
118    }
119}
120
121/// CPAN META.yml parser for CPAN::Meta::Spec v1.4 metadata.
122///
123/// Extracts complete metadata from META.yml files with legacy dependency structure.
124pub struct CpanMetaYmlParser;
125
126impl PackageParser for CpanMetaYmlParser {
127    const PACKAGE_TYPE: PackageType = PackageType::Cpan;
128
129    fn is_match(path: &Path) -> bool {
130        path.file_name().is_some_and(|name| name == "META.yml")
131    }
132
133    fn extract_packages(path: &Path) -> Vec<PackageData> {
134        let yaml = match read_and_parse_yaml(path) {
135            Ok(yaml) => yaml,
136            Err(e) => {
137                warn!("Failed to parse META.yml at {:?}: {}", path, e);
138                return vec![default_package_data(DatasourceId::CpanMetaYml)];
139            }
140        };
141
142        let name = yaml
143            .get(FIELD_NAME)
144            .and_then(|v| v.as_str())
145            .map(String::from);
146
147        let version = extract_version_from_yaml(&yaml);
148
149        let description = yaml
150            .get(FIELD_ABSTRACT)
151            .or_else(|| yaml.get(FIELD_DESCRIPTION))
152            .and_then(|v| v.as_str())
153            .map(String::from);
154
155        let extracted_license_statement = extract_license_from_yaml(&yaml);
156        let (declared_license_expression, declared_license_expression_spdx, license_detections) =
157            normalize_cpan_declared_license(
158                yaml.get(YamlValue::String(FIELD_LICENSE.to_string())),
159                extracted_license_statement.as_deref(),
160            );
161        let parties = extract_parties_from_yaml(&yaml);
162        let dependencies = extract_dependencies_from_yaml(&yaml);
163        let (homepage_url, vcs_url, bug_tracking_url) = extract_resources_from_yaml(&yaml);
164
165        vec![PackageData {
166            package_type: Some(Self::PACKAGE_TYPE),
167            name,
168            version,
169            description,
170            declared_license_expression,
171            declared_license_expression_spdx,
172            license_detections,
173            extracted_license_statement,
174            parties,
175            dependencies,
176            homepage_url,
177            vcs_url,
178            bug_tracking_url,
179            primary_language: Some("Perl".to_string()),
180            datasource_id: Some(DatasourceId::CpanMetaYml),
181            ..Default::default()
182        }]
183    }
184}
185
186/// CPAN MANIFEST parser for module file lists.
187///
188/// Extracts file references from MANIFEST files (simple line-by-line format).
189pub struct CpanManifestParser;
190
191impl PackageParser for CpanManifestParser {
192    const PACKAGE_TYPE: PackageType = PackageType::Cpan;
193
194    fn is_match(path: &Path) -> bool {
195        path.file_name().is_some_and(|name| name == "MANIFEST")
196    }
197
198    fn extract_packages(path: &Path) -> Vec<PackageData> {
199        let content = match fs::read_to_string(path) {
200            Ok(content) => content,
201            Err(e) => {
202                warn!("Failed to read MANIFEST at {:?}: {}", path, e);
203                return vec![default_package_data(DatasourceId::CpanManifest)];
204            }
205        };
206
207        let file_references = content
208            .lines()
209            .filter(|line| !line.trim().is_empty())
210            .filter(|line| !line.trim().starts_with('#'))
211            .map(|line| {
212                // MANIFEST can have comments after whitespace
213                let path = line.split_whitespace().next().unwrap_or(line);
214                FileReference {
215                    path: path.to_string(),
216                    size: None,
217                    sha1: None,
218                    md5: None,
219                    sha256: None,
220                    sha512: None,
221                    extra_data: None,
222                }
223            })
224            .collect();
225
226        vec![PackageData {
227            package_type: Some(Self::PACKAGE_TYPE),
228            file_references,
229            primary_language: Some("Perl".to_string()),
230            datasource_id: Some(DatasourceId::CpanManifest),
231            ..Default::default()
232        }]
233    }
234}
235
236fn default_package_data(datasource_id: DatasourceId) -> PackageData {
237    PackageData {
238        package_type: Some(CpanMetaJsonParser::PACKAGE_TYPE),
239        primary_language: Some("Perl".to_string()),
240        datasource_id: Some(datasource_id),
241        ..Default::default()
242    }
243}
244
245fn read_and_parse_json(path: &Path) -> Result<serde_json::Map<String, JsonValue>, String> {
246    let content = fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
247    let json: JsonValue =
248        serde_json::from_str(&content).map_err(|e| format!("Failed to parse JSON: {}", e))?;
249    json.as_object()
250        .cloned()
251        .ok_or_else(|| "Root JSON is not an object".to_string())
252}
253
254fn read_and_parse_yaml(path: &Path) -> Result<serde_yaml::Mapping, String> {
255    let content = fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
256    let yaml: YamlValue =
257        serde_yaml::from_str(&content).map_err(|e| format!("Failed to parse YAML: {}", e))?;
258    yaml.as_mapping()
259        .cloned()
260        .ok_or_else(|| "Root YAML is not a mapping".to_string())
261}
262
263fn extract_version_from_json(json: &serde_json::Map<String, JsonValue>) -> Option<String> {
264    json.get(FIELD_VERSION).and_then(|v| match v {
265        JsonValue::String(s) => Some(s.clone()),
266        JsonValue::Number(n) => Some(n.to_string()),
267        _ => None,
268    })
269}
270
271fn extract_version_from_yaml(yaml: &serde_yaml::Mapping) -> Option<String> {
272    yaml.get(YamlValue::String(FIELD_VERSION.to_string()))
273        .and_then(|v| match v {
274            YamlValue::String(s) => Some(s.clone()),
275            YamlValue::Number(n) => Some(n.to_string()),
276            _ => None,
277        })
278}
279
280fn extract_license_from_json(json: &serde_json::Map<String, JsonValue>) -> Option<String> {
281    json.get(FIELD_LICENSE).and_then(|v| match v {
282        JsonValue::String(s) => Some(s.clone()),
283        JsonValue::Array(arr) => {
284            let licenses: Vec<String> = arr
285                .iter()
286                .filter_map(|item| item.as_str().map(String::from))
287                .collect();
288            if licenses.is_empty() {
289                None
290            } else {
291                Some(licenses.join(" AND "))
292            }
293        }
294        _ => None,
295    })
296}
297
298fn extract_license_from_yaml(yaml: &serde_yaml::Mapping) -> Option<String> {
299    yaml.get(YamlValue::String(FIELD_LICENSE.to_string()))
300        .and_then(|v| match v {
301            YamlValue::String(s) => Some(s.clone()),
302            YamlValue::Sequence(arr) => {
303                let licenses: Vec<String> = arr
304                    .iter()
305                    .filter_map(|item| item.as_str().map(String::from))
306                    .collect();
307                if licenses.is_empty() {
308                    None
309                } else {
310                    Some(licenses.join(" AND "))
311                }
312            }
313            _ => None,
314        })
315}
316
317fn normalize_cpan_declared_license(
318    raw_license: Option<&impl LicenseValueAdapter>,
319    extracted_license_statement: Option<&str>,
320) -> (
321    Option<String>,
322    Option<String>,
323    Vec<crate::models::LicenseDetection>,
324) {
325    let Some(raw_license) = raw_license else {
326        return empty_declared_license_data();
327    };
328    let normalized = raw_license
329        .license_values()
330        .into_iter()
331        .map(|value| normalize_cpan_license_value(&value))
332        .collect::<Option<Vec<_>>>();
333
334    if let Some(normalized) = normalized
335        && let Some(combined) = combine_normalized_licenses(normalized, " AND ")
336    {
337        return build_declared_license_data(
338            combined,
339            DeclaredLicenseMatchMetadata::single_line(
340                extracted_license_statement.unwrap_or_default(),
341            ),
342        );
343    }
344
345    empty_declared_license_data()
346}
347
348trait LicenseValueAdapter {
349    fn license_values(&self) -> Vec<String>;
350}
351
352impl LicenseValueAdapter for JsonValue {
353    fn license_values(&self) -> Vec<String> {
354        match self {
355            JsonValue::String(value) => vec![value.trim().to_string()],
356            JsonValue::Array(values) => values
357                .iter()
358                .filter_map(|value| value.as_str())
359                .map(str::trim)
360                .filter(|value| !value.is_empty())
361                .map(ToOwned::to_owned)
362                .collect(),
363            _ => Vec::new(),
364        }
365    }
366}
367
368impl LicenseValueAdapter for YamlValue {
369    fn license_values(&self) -> Vec<String> {
370        match self {
371            YamlValue::String(value) => vec![value.trim().to_string()],
372            YamlValue::Sequence(values) => values
373                .iter()
374                .filter_map(|value| value.as_str())
375                .map(str::trim)
376                .filter(|value| !value.is_empty())
377                .map(ToOwned::to_owned)
378                .collect(),
379            _ => Vec::new(),
380        }
381    }
382}
383
384fn normalize_cpan_license_value(value: &str) -> Option<NormalizedDeclaredLicense> {
385    match value.trim() {
386        "perl_5" | "Perl_5" => Some(NormalizedDeclaredLicense::new(
387            "gpl-1.0-plus OR artistic-perl-1.0",
388            "GPL-1.0-or-later OR Artistic-1.0-Perl",
389        )),
390        "artistic_2" => Some(NormalizedDeclaredLicense::new(
391            "artistic-2.0",
392            "Artistic-2.0",
393        )),
394        "apache_2_0" => Some(NormalizedDeclaredLicense::new("apache-2.0", "Apache-2.0")),
395        other => normalize_spdx_expression(other).or_else(|| normalize_declared_license_key(other)),
396    }
397}
398
399fn extract_parties_from_json(json: &serde_json::Map<String, JsonValue>) -> Vec<Party> {
400    json.get(FIELD_AUTHOR)
401        .and_then(|v| v.as_array())
402        .map_or_else(Vec::new, |authors| {
403            authors
404                .iter()
405                .filter_map(|author| {
406                    author.as_str().map(|s| {
407                        let (name, email) = parse_author_string(s);
408                        Party {
409                            r#type: Some("person".to_string()),
410                            role: Some("author".to_string()),
411                            name,
412                            email,
413                            url: None,
414                            organization: None,
415                            organization_url: None,
416                            timezone: None,
417                        }
418                    })
419                })
420                .collect()
421        })
422}
423
424fn extract_parties_from_yaml(yaml: &serde_yaml::Mapping) -> Vec<Party> {
425    yaml.get(YamlValue::String(FIELD_AUTHOR.to_string()))
426        .and_then(|v| v.as_sequence())
427        .map_or_else(Vec::new, |authors| {
428            authors
429                .iter()
430                .filter_map(|author| {
431                    author.as_str().map(|s| {
432                        let (name, email) = parse_author_string(s);
433                        Party {
434                            r#type: Some("person".to_string()),
435                            role: Some("author".to_string()),
436                            name,
437                            email,
438                            url: None,
439                            organization: None,
440                            organization_url: None,
441                            timezone: None,
442                        }
443                    })
444                })
445                .collect()
446        })
447}
448
/// Splits an author entry of the form `Name <email@example.com>` into its
/// `(name, email)` parts.
///
/// The e-mail is the text between the first `<` and the first `>` that follows
/// it, so a stray `>` inside the display name no longer hides the address.
/// When no such bracket pair exists the whole string is treated as the name.
/// Both parts are trimmed, and a part that is empty after trimming is reported
/// as `None` (including a blank input, which yields `(None, None)`).
fn parse_author_string(author_str: &str) -> (Option<String>, Option<String>) {
    /// Trims `text` and returns it only when something is left.
    fn non_empty(text: &str) -> Option<String> {
        let text = text.trim();
        (!text.is_empty()).then(|| text.to_string())
    }

    // Byte offsets of `<` and of the matching `>`; the closing bracket is
    // searched only in the text after the opening one.
    let bracket_span = author_str.find('<').and_then(|open| {
        author_str[open + 1..]
            .find('>')
            .map(|offset| (open, open + 1 + offset))
    });

    match bracket_span {
        Some((open, close)) => (
            non_empty(&author_str[..open]),
            non_empty(&author_str[open + 1..close]),
        ),
        None => (non_empty(author_str), None),
    }
}
473
474fn extract_resources_from_json(
475    json: &serde_json::Map<String, JsonValue>,
476) -> (
477    Option<String>,
478    Option<String>,
479    Option<String>,
480    Option<String>,
481) {
482    let resources = match json.get(FIELD_RESOURCES).and_then(|v| v.as_object()) {
483        Some(r) => r,
484        None => return (None, None, None, None),
485    };
486
487    let homepage_url = resources
488        .get("homepage")
489        .and_then(|v| v.as_str())
490        .map(String::from);
491
492    let vcs_url = resources.get("repository").and_then(|v| match v {
493        JsonValue::String(s) => Some(s.clone()),
494        JsonValue::Object(obj) => obj.get("url").and_then(|u| u.as_str()).map(String::from),
495        _ => None,
496    });
497
498    let code_view_url = resources
499        .get("repository")
500        .and_then(|v| v.as_object())
501        .and_then(|obj| obj.get("web").and_then(|u| u.as_str()).map(String::from));
502
503    let bug_tracking_url = resources.get("bugtracker").and_then(|v| match v {
504        JsonValue::String(s) => Some(s.clone()),
505        JsonValue::Object(obj) => obj.get("web").and_then(|u| u.as_str()).map(String::from),
506        _ => None,
507    });
508
509    (homepage_url, vcs_url, code_view_url, bug_tracking_url)
510}
511
512fn extract_resources_from_yaml(
513    yaml: &serde_yaml::Mapping,
514) -> (Option<String>, Option<String>, Option<String>) {
515    let resources = match yaml
516        .get(YamlValue::String(FIELD_RESOURCES.to_string()))
517        .and_then(|v| v.as_mapping())
518    {
519        Some(r) => r,
520        None => return (None, None, None),
521    };
522
523    let homepage_url = resources
524        .get(YamlValue::String("homepage".to_string()))
525        .and_then(|v| v.as_str())
526        .map(String::from);
527
528    let vcs_url = resources
529        .get(YamlValue::String("repository".to_string()))
530        .and_then(|v| v.as_str())
531        .map(String::from);
532
533    let bug_tracking_url = resources
534        .get(YamlValue::String("bugtracker".to_string()))
535        .and_then(|v| v.as_str())
536        .map(String::from);
537
538    (homepage_url, vcs_url, bug_tracking_url)
539}
540
541fn extract_dependencies_from_json(json: &serde_json::Map<String, JsonValue>) -> Vec<Dependency> {
542    let mut dependencies = Vec::new();
543
544    let prereqs = match json.get(FIELD_PREREQS).and_then(|v| v.as_object()) {
545        Some(p) => p,
546        None => return dependencies,
547    };
548
549    // Extract runtime dependencies
550    if let Some(runtime) = prereqs.get("runtime").and_then(|v| v.as_object())
551        && let Some(requires) = runtime.get("requires").and_then(|v| v.as_object())
552    {
553        dependencies.extend(extract_dependency_group(requires, "runtime", true, false));
554    }
555
556    // Extract build dependencies
557    if let Some(build) = prereqs.get("build").and_then(|v| v.as_object())
558        && let Some(requires) = build.get("requires").and_then(|v| v.as_object())
559    {
560        dependencies.extend(extract_dependency_group(requires, "build", false, false));
561    }
562
563    // Extract test dependencies
564    if let Some(test) = prereqs.get("test").and_then(|v| v.as_object())
565        && let Some(requires) = test.get("requires").and_then(|v| v.as_object())
566    {
567        dependencies.extend(extract_dependency_group(requires, "test", false, false));
568    }
569
570    // Extract configure dependencies
571    if let Some(configure) = prereqs.get("configure").and_then(|v| v.as_object())
572        && let Some(requires) = configure.get("requires").and_then(|v| v.as_object())
573    {
574        dependencies.extend(extract_dependency_group(
575            requires,
576            "configure",
577            false,
578            false,
579        ));
580    }
581
582    dependencies
583}
584
585fn extract_dependencies_from_yaml(yaml: &serde_yaml::Mapping) -> Vec<Dependency> {
586    let mut dependencies = Vec::new();
587
588    // META.yml v1.4 has flat dependency structure
589    if let Some(requires) = yaml
590        .get(YamlValue::String(FIELD_REQUIRES.to_string()))
591        .and_then(|v| v.as_mapping())
592    {
593        dependencies.extend(extract_yaml_dependency_group(
594            requires, "runtime", true, false,
595        ));
596    }
597
598    if let Some(build_requires) = yaml
599        .get(YamlValue::String(FIELD_BUILD_REQUIRES.to_string()))
600        .and_then(|v| v.as_mapping())
601    {
602        dependencies.extend(extract_yaml_dependency_group(
603            build_requires,
604            "build",
605            false,
606            false,
607        ));
608    }
609
610    if let Some(test_requires) = yaml
611        .get(YamlValue::String(FIELD_TEST_REQUIRES.to_string()))
612        .and_then(|v| v.as_mapping())
613    {
614        dependencies.extend(extract_yaml_dependency_group(
615            test_requires,
616            "test",
617            false,
618            false,
619        ));
620    }
621
622    if let Some(configure_requires) = yaml
623        .get(YamlValue::String(FIELD_CONFIGURE_REQUIRES.to_string()))
624        .and_then(|v| v.as_mapping())
625    {
626        dependencies.extend(extract_yaml_dependency_group(
627            configure_requires,
628            "configure",
629            false,
630            false,
631        ));
632    }
633
634    dependencies
635}
636
637fn extract_dependency_group(
638    deps: &serde_json::Map<String, JsonValue>,
639    scope: &str,
640    is_runtime: bool,
641    is_optional: bool,
642) -> Vec<Dependency> {
643    deps.iter()
644        .filter_map(|(name, version)| {
645            // Skip perl itself as it's not a CPAN module
646            if name == "perl" {
647                return None;
648            }
649
650            let purl = PackageUrl::new("cpan", name).ok().map(|p| p.to_string());
651
652            let extracted_requirement = match version {
653                JsonValue::String(s) => Some(s.clone()),
654                JsonValue::Number(n) => Some(n.to_string()),
655                _ => None,
656            };
657
658            Some(Dependency {
659                purl,
660                extracted_requirement,
661                scope: Some(scope.to_string()),
662                is_runtime: Some(is_runtime),
663                is_optional: Some(is_optional),
664                is_pinned: None,
665                is_direct: Some(true),
666                resolved_package: None,
667                extra_data: None,
668            })
669        })
670        .collect()
671}
672
673fn extract_yaml_dependency_group(
674    deps: &serde_yaml::Mapping,
675    scope: &str,
676    is_runtime: bool,
677    is_optional: bool,
678) -> Vec<Dependency> {
679    deps.iter()
680        .filter_map(|(key, value)| {
681            let name = key.as_str()?;
682
683            // Skip perl itself as it's not a CPAN module
684            if name == "perl" {
685                return None;
686            }
687
688            let purl = PackageUrl::new("cpan", name).ok().map(|p| p.to_string());
689
690            let extracted_requirement = match value {
691                YamlValue::String(s) => Some(s.clone()),
692                YamlValue::Number(n) => Some(n.to_string()),
693                _ => None,
694            };
695
696            Some(Dependency {
697                purl,
698                extracted_requirement,
699                scope: Some(scope.to_string()),
700                is_runtime: Some(is_runtime),
701                is_optional: Some(is_optional),
702                is_pinned: None,
703                is_direct: Some(true),
704                resolved_package: None,
705                extra_data: None,
706            })
707        })
708        .collect()
709}
710
// Registration entry for the META.json parser.
// NOTE(review): `register_parser!` is defined elsewhere; judging by the values,
// the arguments are a display name, path glob(s), package type, primary
// language and an optional documentation URL — confirm against the macro.
711crate::register_parser!(
712    "CPAN Perl META.json",
713    &["**/META.json"],
714    "cpan",
715    "Perl",
716    Some("https://metacpan.org/pod/CPAN::Meta::Spec"),
717);
718
// Registration entry for the META.yml parser (same argument layout as above).
719crate::register_parser!(
720    "CPAN Perl META.yml",
721    &["**/META.yml"],
722    "cpan",
723    "Perl",
724    Some("https://metacpan.org/pod/CPAN::Meta::Spec"),
725);
726
// Registration entry for the MANIFEST parser (same argument layout as above).
727crate::register_parser!(
728    "CPAN Perl MANIFEST",
729    &["**/MANIFEST"],
730    "cpan",
731    "Perl",
732    Some("https://metacpan.org/pod/Module::Manifest"),
733);