Skip to main content

provenant/parsers/
cpan.rs

1//! Parser for CPAN Perl package manifests.
2//!
3//! Extracts package metadata, dependencies, and author information from
4//! CPAN distribution files used by Perl modules.
5//!
6//! # Supported Formats
7//! - META.json (CPAN::Meta::Spec v2.0+)
8//! - META.yml (CPAN::Meta::Spec v1.4)
9//! - MANIFEST (file list)
10//!
11//! # Key Features
12//! - Full metadata extraction from META.json and META.yml (beyond Python stub handlers)
13//! - Dependency extraction for all CPAN dependency scopes (runtime, build, test, configure)
14//! - Author party information extraction
15//! - Repository URL extraction
16//! - File references from MANIFEST
17//!
18//! # Implementation Notes
19//! - Uses serde_json for JSON parsing
20//! - Uses serde_yaml for YAML parsing
21//! - Python reference has stub-only handlers with no parse() method
22//! - This is a BEYOND PARITY implementation - we extract complete metadata
23
24use std::fs;
25use std::path::Path;
26
27use log::warn;
28use packageurl::PackageUrl;
29use serde_json::Value as JsonValue;
30use serde_yaml::Value as YamlValue;
31
32use crate::models::{DatasourceId, Dependency, FileReference, PackageData, PackageType, Party};
33
34use super::PackageParser;
35
// Mapping keys looked up in CPAN metadata documents by the parsers in this file.

/// Distribution name key.
const FIELD_NAME: &str = "name";
/// Distribution version key; the value is accepted as a string or a number.
const FIELD_VERSION: &str = "version";
/// Short summary key; primary source for the package description.
const FIELD_ABSTRACT: &str = "abstract";
/// Description key; consulted when `abstract` is missing.
const FIELD_DESCRIPTION: &str = "description";
/// License key; the value is accepted as a single string or a list of strings.
const FIELD_LICENSE: &str = "license";
/// Author key; entries are parsed as `Name <email>` strings.
const FIELD_AUTHOR: &str = "author";
/// Key of the nested mapping holding homepage / repository / bugtracker entries.
const FIELD_RESOURCES: &str = "resources";
/// META.json key of the nested per-phase dependency mapping.
const FIELD_PREREQS: &str = "prereqs";
/// META.yml flat dependency key, reported with scope `runtime`.
const FIELD_REQUIRES: &str = "requires";
/// META.yml flat dependency key, reported with scope `build`.
const FIELD_BUILD_REQUIRES: &str = "build_requires";
/// META.yml flat dependency key, reported with scope `test`.
const FIELD_TEST_REQUIRES: &str = "test_requires";
/// META.yml flat dependency key, reported with scope `configure`.
const FIELD_CONFIGURE_REQUIRES: &str = "configure_requires";
48
49/// CPAN META.json parser for CPAN::Meta::Spec v2.0+ metadata.
50///
51/// Extracts complete metadata from META.json files including dependencies
52/// from all scopes (runtime, build, test, configure).
53pub struct CpanMetaJsonParser;
54
55impl PackageParser for CpanMetaJsonParser {
56    const PACKAGE_TYPE: PackageType = PackageType::Cpan;
57
58    fn is_match(path: &Path) -> bool {
59        path.file_name().is_some_and(|name| name == "META.json")
60    }
61
62    fn extract_packages(path: &Path) -> Vec<PackageData> {
63        let json = match read_and_parse_json(path) {
64            Ok(json) => json,
65            Err(e) => {
66                warn!("Failed to parse META.json at {:?}: {}", path, e);
67                return vec![default_package_data(DatasourceId::CpanMetaJson)];
68            }
69        };
70
71        let name = json
72            .get(FIELD_NAME)
73            .and_then(|v| v.as_str())
74            .map(String::from);
75
76        let version = extract_version_from_json(&json);
77
78        let description = json
79            .get(FIELD_ABSTRACT)
80            .and_then(|v| v.as_str())
81            .map(String::from);
82
83        let extracted_license_statement = extract_license_from_json(&json);
84        let parties = extract_parties_from_json(&json);
85        let dependencies = extract_dependencies_from_json(&json);
86        let (homepage_url, vcs_url, code_view_url, bug_tracking_url) =
87            extract_resources_from_json(&json);
88
89        vec![PackageData {
90            package_type: Some(Self::PACKAGE_TYPE),
91            name,
92            version,
93            description,
94            extracted_license_statement,
95            parties,
96            dependencies,
97            homepage_url,
98            vcs_url,
99            code_view_url,
100            bug_tracking_url,
101            primary_language: Some("Perl".to_string()),
102            datasource_id: Some(DatasourceId::CpanMetaJson),
103            ..Default::default()
104        }]
105    }
106}
107
108/// CPAN META.yml parser for CPAN::Meta::Spec v1.4 metadata.
109///
110/// Extracts complete metadata from META.yml files with legacy dependency structure.
111pub struct CpanMetaYmlParser;
112
113impl PackageParser for CpanMetaYmlParser {
114    const PACKAGE_TYPE: PackageType = PackageType::Cpan;
115
116    fn is_match(path: &Path) -> bool {
117        path.file_name().is_some_and(|name| name == "META.yml")
118    }
119
120    fn extract_packages(path: &Path) -> Vec<PackageData> {
121        let yaml = match read_and_parse_yaml(path) {
122            Ok(yaml) => yaml,
123            Err(e) => {
124                warn!("Failed to parse META.yml at {:?}: {}", path, e);
125                return vec![default_package_data(DatasourceId::CpanMetaYml)];
126            }
127        };
128
129        let name = yaml
130            .get(FIELD_NAME)
131            .and_then(|v| v.as_str())
132            .map(String::from);
133
134        let version = extract_version_from_yaml(&yaml);
135
136        let description = yaml
137            .get(FIELD_ABSTRACT)
138            .or_else(|| yaml.get(FIELD_DESCRIPTION))
139            .and_then(|v| v.as_str())
140            .map(String::from);
141
142        let extracted_license_statement = extract_license_from_yaml(&yaml);
143        let parties = extract_parties_from_yaml(&yaml);
144        let dependencies = extract_dependencies_from_yaml(&yaml);
145        let (homepage_url, vcs_url, bug_tracking_url) = extract_resources_from_yaml(&yaml);
146
147        vec![PackageData {
148            package_type: Some(Self::PACKAGE_TYPE),
149            name,
150            version,
151            description,
152            extracted_license_statement,
153            parties,
154            dependencies,
155            homepage_url,
156            vcs_url,
157            bug_tracking_url,
158            primary_language: Some("Perl".to_string()),
159            datasource_id: Some(DatasourceId::CpanMetaYml),
160            ..Default::default()
161        }]
162    }
163}
164
165/// CPAN MANIFEST parser for module file lists.
166///
167/// Extracts file references from MANIFEST files (simple line-by-line format).
168pub struct CpanManifestParser;
169
170impl PackageParser for CpanManifestParser {
171    const PACKAGE_TYPE: PackageType = PackageType::Cpan;
172
173    fn is_match(path: &Path) -> bool {
174        path.file_name().is_some_and(|name| name == "MANIFEST")
175    }
176
177    fn extract_packages(path: &Path) -> Vec<PackageData> {
178        let content = match fs::read_to_string(path) {
179            Ok(content) => content,
180            Err(e) => {
181                warn!("Failed to read MANIFEST at {:?}: {}", path, e);
182                return vec![default_package_data(DatasourceId::CpanManifest)];
183            }
184        };
185
186        let file_references = content
187            .lines()
188            .filter(|line| !line.trim().is_empty())
189            .filter(|line| !line.trim().starts_with('#'))
190            .map(|line| {
191                // MANIFEST can have comments after whitespace
192                let path = line.split_whitespace().next().unwrap_or(line);
193                FileReference {
194                    path: path.to_string(),
195                    size: None,
196                    sha1: None,
197                    md5: None,
198                    sha256: None,
199                    sha512: None,
200                    extra_data: None,
201                }
202            })
203            .collect();
204
205        vec![PackageData {
206            package_type: Some(Self::PACKAGE_TYPE),
207            file_references,
208            primary_language: Some("Perl".to_string()),
209            datasource_id: Some(DatasourceId::CpanManifest),
210            ..Default::default()
211        }]
212    }
213}
214
215fn default_package_data(datasource_id: DatasourceId) -> PackageData {
216    PackageData {
217        package_type: Some(CpanMetaJsonParser::PACKAGE_TYPE),
218        primary_language: Some("Perl".to_string()),
219        datasource_id: Some(datasource_id),
220        ..Default::default()
221    }
222}
223
224fn read_and_parse_json(path: &Path) -> Result<serde_json::Map<String, JsonValue>, String> {
225    let content = fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
226    let json: JsonValue =
227        serde_json::from_str(&content).map_err(|e| format!("Failed to parse JSON: {}", e))?;
228    json.as_object()
229        .cloned()
230        .ok_or_else(|| "Root JSON is not an object".to_string())
231}
232
233fn read_and_parse_yaml(path: &Path) -> Result<serde_yaml::Mapping, String> {
234    let content = fs::read_to_string(path).map_err(|e| format!("Failed to read file: {}", e))?;
235    let yaml: YamlValue =
236        serde_yaml::from_str(&content).map_err(|e| format!("Failed to parse YAML: {}", e))?;
237    yaml.as_mapping()
238        .cloned()
239        .ok_or_else(|| "Root YAML is not a mapping".to_string())
240}
241
242fn extract_version_from_json(json: &serde_json::Map<String, JsonValue>) -> Option<String> {
243    json.get(FIELD_VERSION).and_then(|v| match v {
244        JsonValue::String(s) => Some(s.clone()),
245        JsonValue::Number(n) => Some(n.to_string()),
246        _ => None,
247    })
248}
249
250fn extract_version_from_yaml(yaml: &serde_yaml::Mapping) -> Option<String> {
251    yaml.get(YamlValue::String(FIELD_VERSION.to_string()))
252        .and_then(|v| match v {
253            YamlValue::String(s) => Some(s.clone()),
254            YamlValue::Number(n) => Some(n.to_string()),
255            _ => None,
256        })
257}
258
259fn extract_license_from_json(json: &serde_json::Map<String, JsonValue>) -> Option<String> {
260    json.get(FIELD_LICENSE).and_then(|v| match v {
261        JsonValue::String(s) => Some(s.clone()),
262        JsonValue::Array(arr) => {
263            let licenses: Vec<String> = arr
264                .iter()
265                .filter_map(|item| item.as_str().map(String::from))
266                .collect();
267            if licenses.is_empty() {
268                None
269            } else {
270                Some(licenses.join(" AND "))
271            }
272        }
273        _ => None,
274    })
275}
276
277fn extract_license_from_yaml(yaml: &serde_yaml::Mapping) -> Option<String> {
278    yaml.get(YamlValue::String(FIELD_LICENSE.to_string()))
279        .and_then(|v| match v {
280            YamlValue::String(s) => Some(s.clone()),
281            YamlValue::Sequence(arr) => {
282                let licenses: Vec<String> = arr
283                    .iter()
284                    .filter_map(|item| item.as_str().map(String::from))
285                    .collect();
286                if licenses.is_empty() {
287                    None
288                } else {
289                    Some(licenses.join(" AND "))
290                }
291            }
292            _ => None,
293        })
294}
295
296fn extract_parties_from_json(json: &serde_json::Map<String, JsonValue>) -> Vec<Party> {
297    json.get(FIELD_AUTHOR)
298        .and_then(|v| v.as_array())
299        .map_or_else(Vec::new, |authors| {
300            authors
301                .iter()
302                .filter_map(|author| {
303                    author.as_str().map(|s| {
304                        let (name, email) = parse_author_string(s);
305                        Party {
306                            r#type: Some("person".to_string()),
307                            role: Some("author".to_string()),
308                            name,
309                            email,
310                            url: None,
311                            organization: None,
312                            organization_url: None,
313                            timezone: None,
314                        }
315                    })
316                })
317                .collect()
318        })
319}
320
321fn extract_parties_from_yaml(yaml: &serde_yaml::Mapping) -> Vec<Party> {
322    yaml.get(YamlValue::String(FIELD_AUTHOR.to_string()))
323        .and_then(|v| v.as_sequence())
324        .map_or_else(Vec::new, |authors| {
325            authors
326                .iter()
327                .filter_map(|author| {
328                    author.as_str().map(|s| {
329                        let (name, email) = parse_author_string(s);
330                        Party {
331                            r#type: Some("person".to_string()),
332                            role: Some("author".to_string()),
333                            name,
334                            email,
335                            url: None,
336                            organization: None,
337                            organization_url: None,
338                            timezone: None,
339                        }
340                    })
341                })
342                .collect()
343        })
344}
345
/// Splits an author string of the form `Name <email@example.com>` into
/// `(name, email)`.
///
/// * The e-mail is the text between the first `<` and the first `>` that
///   follows it; the name is everything before that `<`.
/// * Both parts are trimmed, and a part that is empty after trimming is
///   reported as `None`.
/// * When no `<...>` pair is found, the whole trimmed string is the name and
///   the e-mail is `None`.
fn parse_author_string(author_str: &str) -> (Option<String>, Option<String>) {
    /// Trims `text`, mapping an empty result to `None`.
    fn non_empty(text: &str) -> Option<String> {
        let text = text.trim();
        if text.is_empty() {
            None
        } else {
            Some(text.to_string())
        }
    }

    if let Some((name, rest)) = author_str.split_once('<') {
        // Look for the closing bracket only after the opening one, so a stray
        // `>` earlier in the name cannot hide the address.
        if let Some((email, _)) = rest.split_once('>') {
            return (non_empty(name), non_empty(email));
        }
    }

    // No bracketed address: the entire string is the name.
    (non_empty(author_str), None)
}
370
371fn extract_resources_from_json(
372    json: &serde_json::Map<String, JsonValue>,
373) -> (
374    Option<String>,
375    Option<String>,
376    Option<String>,
377    Option<String>,
378) {
379    let resources = match json.get(FIELD_RESOURCES).and_then(|v| v.as_object()) {
380        Some(r) => r,
381        None => return (None, None, None, None),
382    };
383
384    let homepage_url = resources
385        .get("homepage")
386        .and_then(|v| v.as_str())
387        .map(String::from);
388
389    let vcs_url = resources.get("repository").and_then(|v| match v {
390        JsonValue::String(s) => Some(s.clone()),
391        JsonValue::Object(obj) => obj.get("url").and_then(|u| u.as_str()).map(String::from),
392        _ => None,
393    });
394
395    let code_view_url = resources
396        .get("repository")
397        .and_then(|v| v.as_object())
398        .and_then(|obj| obj.get("web").and_then(|u| u.as_str()).map(String::from));
399
400    let bug_tracking_url = resources.get("bugtracker").and_then(|v| match v {
401        JsonValue::String(s) => Some(s.clone()),
402        JsonValue::Object(obj) => obj.get("web").and_then(|u| u.as_str()).map(String::from),
403        _ => None,
404    });
405
406    (homepage_url, vcs_url, code_view_url, bug_tracking_url)
407}
408
409fn extract_resources_from_yaml(
410    yaml: &serde_yaml::Mapping,
411) -> (Option<String>, Option<String>, Option<String>) {
412    let resources = match yaml
413        .get(YamlValue::String(FIELD_RESOURCES.to_string()))
414        .and_then(|v| v.as_mapping())
415    {
416        Some(r) => r,
417        None => return (None, None, None),
418    };
419
420    let homepage_url = resources
421        .get(YamlValue::String("homepage".to_string()))
422        .and_then(|v| v.as_str())
423        .map(String::from);
424
425    let vcs_url = resources
426        .get(YamlValue::String("repository".to_string()))
427        .and_then(|v| v.as_str())
428        .map(String::from);
429
430    let bug_tracking_url = resources
431        .get(YamlValue::String("bugtracker".to_string()))
432        .and_then(|v| v.as_str())
433        .map(String::from);
434
435    (homepage_url, vcs_url, bug_tracking_url)
436}
437
438fn extract_dependencies_from_json(json: &serde_json::Map<String, JsonValue>) -> Vec<Dependency> {
439    let mut dependencies = Vec::new();
440
441    let prereqs = match json.get(FIELD_PREREQS).and_then(|v| v.as_object()) {
442        Some(p) => p,
443        None => return dependencies,
444    };
445
446    // Extract runtime dependencies
447    if let Some(runtime) = prereqs.get("runtime").and_then(|v| v.as_object())
448        && let Some(requires) = runtime.get("requires").and_then(|v| v.as_object())
449    {
450        dependencies.extend(extract_dependency_group(requires, "runtime", true, false));
451    }
452
453    // Extract build dependencies
454    if let Some(build) = prereqs.get("build").and_then(|v| v.as_object())
455        && let Some(requires) = build.get("requires").and_then(|v| v.as_object())
456    {
457        dependencies.extend(extract_dependency_group(requires, "build", false, false));
458    }
459
460    // Extract test dependencies
461    if let Some(test) = prereqs.get("test").and_then(|v| v.as_object())
462        && let Some(requires) = test.get("requires").and_then(|v| v.as_object())
463    {
464        dependencies.extend(extract_dependency_group(requires, "test", false, false));
465    }
466
467    // Extract configure dependencies
468    if let Some(configure) = prereqs.get("configure").and_then(|v| v.as_object())
469        && let Some(requires) = configure.get("requires").and_then(|v| v.as_object())
470    {
471        dependencies.extend(extract_dependency_group(
472            requires,
473            "configure",
474            false,
475            false,
476        ));
477    }
478
479    dependencies
480}
481
482fn extract_dependencies_from_yaml(yaml: &serde_yaml::Mapping) -> Vec<Dependency> {
483    let mut dependencies = Vec::new();
484
485    // META.yml v1.4 has flat dependency structure
486    if let Some(requires) = yaml
487        .get(YamlValue::String(FIELD_REQUIRES.to_string()))
488        .and_then(|v| v.as_mapping())
489    {
490        dependencies.extend(extract_yaml_dependency_group(
491            requires, "runtime", true, false,
492        ));
493    }
494
495    if let Some(build_requires) = yaml
496        .get(YamlValue::String(FIELD_BUILD_REQUIRES.to_string()))
497        .and_then(|v| v.as_mapping())
498    {
499        dependencies.extend(extract_yaml_dependency_group(
500            build_requires,
501            "build",
502            false,
503            false,
504        ));
505    }
506
507    if let Some(test_requires) = yaml
508        .get(YamlValue::String(FIELD_TEST_REQUIRES.to_string()))
509        .and_then(|v| v.as_mapping())
510    {
511        dependencies.extend(extract_yaml_dependency_group(
512            test_requires,
513            "test",
514            false,
515            false,
516        ));
517    }
518
519    if let Some(configure_requires) = yaml
520        .get(YamlValue::String(FIELD_CONFIGURE_REQUIRES.to_string()))
521        .and_then(|v| v.as_mapping())
522    {
523        dependencies.extend(extract_yaml_dependency_group(
524            configure_requires,
525            "configure",
526            false,
527            false,
528        ));
529    }
530
531    dependencies
532}
533
534fn extract_dependency_group(
535    deps: &serde_json::Map<String, JsonValue>,
536    scope: &str,
537    is_runtime: bool,
538    is_optional: bool,
539) -> Vec<Dependency> {
540    deps.iter()
541        .filter_map(|(name, version)| {
542            // Skip perl itself as it's not a CPAN module
543            if name == "perl" {
544                return None;
545            }
546
547            let purl = PackageUrl::new("cpan", name).ok().map(|p| p.to_string());
548
549            let extracted_requirement = match version {
550                JsonValue::String(s) => Some(s.clone()),
551                JsonValue::Number(n) => Some(n.to_string()),
552                _ => None,
553            };
554
555            Some(Dependency {
556                purl,
557                extracted_requirement,
558                scope: Some(scope.to_string()),
559                is_runtime: Some(is_runtime),
560                is_optional: Some(is_optional),
561                is_pinned: None,
562                is_direct: Some(true),
563                resolved_package: None,
564                extra_data: None,
565            })
566        })
567        .collect()
568}
569
570fn extract_yaml_dependency_group(
571    deps: &serde_yaml::Mapping,
572    scope: &str,
573    is_runtime: bool,
574    is_optional: bool,
575) -> Vec<Dependency> {
576    deps.iter()
577        .filter_map(|(key, value)| {
578            let name = key.as_str()?;
579
580            // Skip perl itself as it's not a CPAN module
581            if name == "perl" {
582                return None;
583            }
584
585            let purl = PackageUrl::new("cpan", name).ok().map(|p| p.to_string());
586
587            let extracted_requirement = match value {
588                YamlValue::String(s) => Some(s.clone()),
589                YamlValue::Number(n) => Some(n.to_string()),
590                _ => None,
591            };
592
593            Some(Dependency {
594                purl,
595                extracted_requirement,
596                scope: Some(scope.to_string()),
597                is_runtime: Some(is_runtime),
598                is_optional: Some(is_optional),
599                is_pinned: None,
600                is_direct: Some(true),
601                resolved_package: None,
602                extra_data: None,
603            })
604        })
605        .collect()
606}
607
// Parser registrations, one per file kind handled in this module.
// NOTE(review): `register_parser!` is defined elsewhere in the crate; the
// arguments look like (description, path glob patterns, package type,
// primary language, documentation URL) — confirm against the macro.

// META.json registration.
crate::register_parser!(
    "CPAN Perl META.json",
    &["**/META.json"],
    "cpan",
    "Perl",
    Some("https://metacpan.org/pod/CPAN::Meta::Spec"),
);

// META.yml registration.
crate::register_parser!(
    "CPAN Perl META.yml",
    &["**/META.yml"],
    "cpan",
    "Perl",
    Some("https://metacpan.org/pod/CPAN::Meta::Spec"),
);

// MANIFEST registration.
crate::register_parser!(
    "CPAN Perl MANIFEST",
    &["**/MANIFEST"],
    "cpan",
    "Perl",
    Some("https://metacpan.org/pod/Module::Manifest"),
);
630);