Skip to main content

provenant/parsers/
cpan_makefile_pl.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for CPAN Perl Makefile.PL files.
5//!
6//! Extracts Perl package metadata from `Makefile.PL` files used by ExtUtils::MakeMaker.
7//!
8//! # Supported Formats
9//! - `Makefile.PL` - CPAN ExtUtils::MakeMaker build configuration
10//!
11//! # Implementation Notes
12//! - Format: Perl script with WriteMakefile() or WriteMakefile1() calls
13//! - Spec: https://metacpan.org/pod/ExtUtils::MakeMaker
14//! - Extracts: NAME, VERSION, AUTHOR, LICENSE, ABSTRACT, PREREQ_PM, BUILD_REQUIRES, TEST_REQUIRES, CONFIGURE_REQUIRES
15//! - Uses regex-based extraction (no Perl code execution for security)
16//! - Python reference has stub-only handler with no parse() method - this is BEYOND PARITY
17
18use std::collections::HashMap;
19use std::path::Path;
20use std::sync::LazyLock;
21
22use crate::parser_warn as warn;
23use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
24use packageurl::PackageUrl;
25use regex::Regex;
26use serde_json::json;
27
28use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
29
30use super::PackageParser;
31use super::license_normalization::{
32    DeclaredLicenseMatchMetadata, NormalizedDeclaredLicense, build_declared_license_data,
33    empty_declared_license_data, normalize_declared_license_key, normalize_spdx_expression,
34};
35
36static RE_WRITEMAKEFILE: LazyLock<Regex> = LazyLock::new(|| {
37    Regex::new(r"WriteMakefile1?\s*\(").expect("valid regex: WriteMakefile call pattern")
38});
39static RE_SIMPLE_KV: LazyLock<Regex> = LazyLock::new(|| {
40    Regex::new(r#"(?m)^\s*([A-Z_]+)\s*=>\s*(?:'([^']*)'|"([^"]*)"|q\{([^}]*)\}|q\(([^)]*)\))"#)
41        .expect("valid regex: simple key=>value pattern")
42});
43static RE_HASH_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
44    Regex::new(r"([A-Z_]+)\s*=>\s*\{([^}]*)\}").expect("valid regex: hash block pattern")
45});
46static RE_AUTHOR_ARRAY: LazyLock<Regex> = LazyLock::new(|| {
47    Regex::new(r"AUTHOR\s*=>\s*\[([^\]]*)\]").expect("valid regex: AUTHOR array pattern")
48});
49static RE_QUOTED_STRING: LazyLock<Regex> = LazyLock::new(|| {
50    Regex::new(r#"['"]([^'"]*)['"']"#).expect("valid regex: quoted string pattern")
51});
52static RE_DEP_PAIR: LazyLock<Regex> = LazyLock::new(|| {
53    Regex::new(r#"['"]([^'"]+)['"]\s*=>\s*(?:'([^']*)'|"([^"]*)"|(\d+))"#)
54        .expect("valid regex: dependency pair pattern")
55});
56static RE_VERSION_ASSIGNMENT: LazyLock<Regex> = LazyLock::new(|| {
57    Regex::new(
58        r#"(?m)^\s*(?:our\s+)?\$(?:[A-Za-z_][\w:]*::)?VERSION\s*=\s*(?:'([^']+)'|"([^"]+)")"#,
59    )
60    .expect("valid regex: VERSION assignment pattern")
61});
62
63const PACKAGE_TYPE: PackageType = PackageType::Cpan;
64const MAX_METADATA_FILE_SIZE: u64 = 1024 * 1024;
65
66pub struct CpanMakefilePlParser;
67
68impl PackageParser for CpanMakefilePlParser {
69    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
70
71    fn is_match(path: &Path) -> bool {
72        path.file_name().is_some_and(|name| name == "Makefile.PL")
73    }
74
75    fn extract_packages(path: &Path) -> Vec<PackageData> {
76        let content = match read_file_to_string(path, None) {
77            Ok(c) => c,
78            Err(e) => {
79                warn!("Failed to read Makefile.PL file {:?}: {}", path, e);
80                return vec![PackageData {
81                    package_type: Some(PACKAGE_TYPE),
82                    primary_language: Some("Perl".to_string()),
83                    datasource_id: Some(DatasourceId::CpanMakefile),
84                    ..Default::default()
85                }];
86            }
87        };
88
89        vec![parse_makefile_pl_with_base(&content, path.parent())]
90    }
91
92    fn metadata() -> Vec<super::metadata::ParserMetadata> {
93        vec![super::metadata::ParserMetadata {
94            description: "CPAN Perl Makefile.PL",
95            file_patterns: &["*/Makefile.PL"],
96            package_type: "cpan",
97            primary_language: "Perl",
98            documentation_url: Some("https://metacpan.org/pod/ExtUtils::MakeMaker"),
99        }]
100    }
101}
102
/// Test-only convenience wrapper: parses `content` without a base directory, so
/// `VERSION_FROM` / `ABSTRACT_FROM` references are never resolved from disk.
#[cfg(test)]
pub(crate) fn parse_makefile_pl(content: &str) -> PackageData {
    let no_base_dir: Option<&Path> = None;
    parse_makefile_pl_with_base(content, no_base_dir)
}
107
108pub(crate) fn parse_makefile_pl_with_base(content: &str, base_dir: Option<&Path>) -> PackageData {
109    // Find WriteMakefile or WriteMakefile1 call
110    let makefile_block = extract_writemakefile_block(content);
111    if makefile_block.is_empty() {
112        return default_package_data();
113    }
114
115    let fields = parse_hash_fields(&makefile_block);
116
117    let name = fields.get("NAME").and_then(|n| sanitize_scalar_field(n));
118    let resolved_metadata = resolve_referenced_metadata(&fields, base_dir);
119
120    let version = fields
121        .get("VERSION")
122        .and_then(|v| sanitize_scalar_field(v))
123        .or_else(|| resolved_metadata.version.clone());
124    let description = fields
125        .get("ABSTRACT")
126        .and_then(|d| sanitize_scalar_field(d))
127        .or_else(|| resolved_metadata.abstract_text.clone());
128    let extracted_license_statement = fields.get("LICENSE").and_then(|l| sanitize_scalar_field(l));
129    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
130        extracted_license_statement
131            .as_deref()
132            .and_then(normalize_cpan_makefile_license)
133            .map(|normalized| {
134                build_declared_license_data(
135                    normalized,
136                    DeclaredLicenseMatchMetadata::single_line(
137                        extracted_license_statement.as_deref().unwrap_or_default(),
138                    ),
139                )
140            })
141            .unwrap_or_else(empty_declared_license_data);
142
143    let parties = parse_author(&fields);
144    let dependencies = parse_dependencies(&fields);
145
146    let mut extra_data = HashMap::new();
147    if let Some(min_perl) = fields
148        .get("MIN_PERL_VERSION")
149        .and_then(|value| sanitize_scalar_field(value))
150    {
151        extra_data.insert("MIN_PERL_VERSION".to_string(), json!(min_perl));
152    }
153    if let Some(version_from) = fields
154        .get("VERSION_FROM")
155        .and_then(|value| sanitize_scalar_field(value))
156    {
157        extra_data.insert("VERSION_FROM".to_string(), json!(version_from));
158    }
159    if let Some(abstract_from) = fields
160        .get("ABSTRACT_FROM")
161        .and_then(|value| sanitize_scalar_field(value))
162    {
163        extra_data.insert("ABSTRACT_FROM".to_string(), json!(abstract_from));
164    }
165
166    // Build PURL: convert Foo::Bar to Foo-Bar for CPAN naming convention
167    let purl = name.as_ref().and_then(|n| {
168        let purl_name = n.replace("::", "-");
169        PackageUrl::new("cpan", &purl_name).ok().map(|mut p| {
170            if let Some(v) = &version {
171                let _ = p.with_version(v).ok();
172            }
173            p.to_string()
174        })
175    });
176
177    PackageData {
178        package_type: Some(PACKAGE_TYPE),
179        namespace: Some("cpan".to_string()),
180        name,
181        version,
182        description,
183        declared_license_expression,
184        declared_license_expression_spdx,
185        license_detections,
186        extracted_license_statement,
187        parties,
188        dependencies,
189        extra_data: if extra_data.is_empty() {
190            None
191        } else {
192            Some(extra_data)
193        },
194        purl,
195        datasource_id: Some(DatasourceId::CpanMakefile),
196        primary_language: Some("Perl".to_string()),
197        ..Default::default()
198    }
199}
200
/// Metadata recovered from files referenced by `VERSION_FROM` / `ABSTRACT_FROM`.
///
/// Both fields default to `None`, meaning nothing could be resolved.
#[derive(Default)]
struct ResolvedMetadata {
    /// Version string read from the `VERSION_FROM` file, if any.
    version: Option<String>,
    /// Abstract read from the `ABSTRACT_FROM` file, if any.
    abstract_text: Option<String>,
}
206
207fn default_package_data() -> PackageData {
208    PackageData {
209        package_type: Some(PACKAGE_TYPE),
210        primary_language: Some("Perl".to_string()),
211        datasource_id: Some(DatasourceId::CpanMakefile),
212        ..Default::default()
213    }
214}
215
216fn normalize_cpan_makefile_license(value: &str) -> Option<NormalizedDeclaredLicense> {
217    match value.trim() {
218        "perl_5" | "Perl_5" => Some(NormalizedDeclaredLicense::new(
219            "gpl-1.0-plus OR artistic-perl-1.0",
220            "GPL-1.0-or-later OR Artistic-1.0-Perl",
221        )),
222        "artistic_2" => Some(NormalizedDeclaredLicense::new(
223            "artistic-2.0",
224            "Artistic-2.0",
225        )),
226        "apache_2_0" => Some(NormalizedDeclaredLicense::new("apache-2.0", "Apache-2.0")),
227        other => normalize_spdx_expression(other).or_else(|| normalize_declared_license_key(other)),
228    }
229}
230
231fn sanitize_scalar_field(value: &str) -> Option<String> {
232    let trimmed = value.trim();
233    if trimmed.is_empty() || looks_like_unresolved_template_value(trimmed) {
234        return None;
235    }
236
237    Some(truncate_field(trimmed.to_string()))
238}
239
/// Reports whether `value` appears to be an unexpanded template or boilerplate
/// placeholder rather than real metadata.
///
/// A value is flagged when, after trimming, it contains any of the template
/// delimiters listed below, contains both `{{` and `}}`, or equals one of the
/// known boilerplate strings (compared ASCII case-insensitively).
fn looks_like_unresolved_template_value(value: &str) -> bool {
    // Any single occurrence of one of these marks the value as a template.
    const DELIMITERS: [&str; 7] = ["[%", "%]", "<%", "%>", "${{", "[d2%", "%2d]"];
    // Scaffold defaults that were never filled in.
    const BOILERPLATE: [&str; 3] = [
        "YOUR NAME",
        "YOUR APPLICATION ABSTRACT",
        "YOUREMAIL@EXAMPLE.COM",
    ];

    let candidate = value.trim();

    if DELIMITERS.iter().any(|marker| candidate.contains(*marker)) {
        return true;
    }
    if candidate.contains("{{") && candidate.contains("}}") {
        return true;
    }

    BOILERPLATE
        .iter()
        .any(|placeholder| candidate.eq_ignore_ascii_case(placeholder))
}
257
258fn resolve_referenced_metadata(
259    fields: &HashMap<String, String>,
260    base_dir: Option<&Path>,
261) -> ResolvedMetadata {
262    let Some(base_dir) = base_dir else {
263        return ResolvedMetadata::default();
264    };
265
266    let mut resolved = ResolvedMetadata::default();
267    let mut cache: HashMap<String, Option<String>> = HashMap::new();
268
269    if let Some(version_from) = fields.get("VERSION_FROM")
270        && !looks_like_unresolved_template_value(version_from)
271        && let Some(content) = load_referenced_metadata_file(base_dir, version_from, &mut cache)
272    {
273        resolved.version = extract_version_from_module_content(content);
274    }
275
276    if let Some(abstract_from) = fields.get("ABSTRACT_FROM")
277        && !looks_like_unresolved_template_value(abstract_from)
278        && let Some(content) = load_referenced_metadata_file(base_dir, abstract_from, &mut cache)
279    {
280        resolved.abstract_text = extract_abstract_from_module_content(content);
281    }
282
283    resolved
284}
285
286fn load_referenced_metadata_file<'a>(
287    base_dir: &Path,
288    relative_path: &str,
289    cache: &'a mut HashMap<String, Option<String>>,
290) -> Option<&'a String> {
291    let entry = cache
292        .entry(relative_path.to_string())
293        .or_insert_with(|| read_safe_metadata_file(base_dir, relative_path));
294    entry.as_ref()
295}
296
297fn read_safe_metadata_file(base_dir: &Path, relative_path: &str) -> Option<String> {
298    let ref_path = Path::new(relative_path);
299    if ref_path.is_absolute() {
300        return None;
301    }
302
303    let base_dir = base_dir.canonicalize().ok()?;
304    let candidate = base_dir.join(ref_path);
305    let canonical_candidate = candidate.canonicalize().ok()?;
306    if !canonical_candidate.starts_with(&base_dir) {
307        return None;
308    }
309
310    let metadata = std::fs::metadata(&canonical_candidate).ok()?;
311    if !metadata.is_file() || metadata.len() > MAX_METADATA_FILE_SIZE {
312        return None;
313    }
314
315    read_file_to_string(&canonical_candidate, None).ok()
316}
317
318fn extract_version_from_module_content(content: &str) -> Option<String> {
319    RE_VERSION_ASSIGNMENT
320        .captures(content)
321        .and_then(|caps| caps.get(1).or_else(|| caps.get(2)))
322        .map(|m| m.as_str().trim().to_string())
323        .map(truncate_field)
324        .filter(|value| !value.is_empty())
325}
326
327fn extract_abstract_from_module_content(content: &str) -> Option<String> {
328    let mut in_name_section = false;
329
330    for line in content.lines() {
331        let trimmed = line.trim();
332        if trimmed == "=head1 NAME" {
333            in_name_section = true;
334            continue;
335        }
336
337        if in_name_section {
338            if trimmed.starts_with('=') {
339                break;
340            }
341            if trimmed.is_empty() {
342                continue;
343            }
344
345            if let Some((_, abstract_text)) = trimmed.split_once(" - ") {
346                let abstract_text = abstract_text.trim();
347                if !abstract_text.is_empty() {
348                    return Some(truncate_field(abstract_text.to_string()));
349                }
350            }
351        }
352    }
353
354    None
355}
356
357fn extract_writemakefile_block(content: &str) -> String {
358    let start_match = match RE_WRITEMAKEFILE.find(content) {
359        Some(m) => m,
360        None => return String::new(),
361    };
362
363    let start_pos = start_match.end();
364    let content_from_start = &content[start_pos..];
365
366    // Find the matching closing parenthesis
367    let mut depth = 1;
368    let mut end_pos = 0;
369    let chars: Vec<char> = content_from_start.chars().collect();
370
371    for (i, &ch) in chars.iter().enumerate() {
372        if i >= MAX_ITERATION_COUNT {
373            break;
374        }
375        match ch {
376            '(' => depth += 1,
377            ')' => {
378                depth -= 1;
379                if depth == 0 {
380                    end_pos = i;
381                    break;
382                }
383            }
384            _ => {}
385        }
386    }
387
388    if end_pos > 0 {
389        content_from_start[..end_pos].to_string()
390    } else {
391        String::new()
392    }
393}
394
395fn parse_hash_fields(content: &str) -> HashMap<String, String> {
396    let mut fields = HashMap::new();
397
398    for cap in RE_SIMPLE_KV
399        .captures_iter(content)
400        .take(MAX_ITERATION_COUNT)
401    {
402        let key = cap.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
403        let value = cap
404            .get(2)
405            .or_else(|| cap.get(3))
406            .or_else(|| cap.get(4))
407            .or_else(|| cap.get(5))
408            .map(|m| m.as_str().to_string());
409
410        if let Some(v) = value {
411            fields.insert(key, v);
412        }
413    }
414
415    // Parse hash values (PREREQ_PM, BUILD_REQUIRES, etc.)
416    parse_hash_dependencies(content, &mut fields);
417
418    // Parse array refs for AUTHOR
419    parse_author_array(content, &mut fields);
420
421    fields
422}
423
424fn parse_hash_dependencies(content: &str, fields: &mut HashMap<String, String>) {
425    for cap in RE_HASH_BLOCK
426        .captures_iter(content)
427        .take(MAX_ITERATION_COUNT)
428    {
429        let key = cap.get(1).map(|m| m.as_str()).unwrap_or("");
430        let hash_content = cap.get(2).map(|m| m.as_str()).unwrap_or("");
431
432        // For dependency hashes, we'll store them with a special marker
433        // so parse_dependencies can find them
434        if matches!(
435            key,
436            "PREREQ_PM" | "BUILD_REQUIRES" | "TEST_REQUIRES" | "CONFIGURE_REQUIRES"
437        ) {
438            fields.insert(format!("_HASH_{}", key), hash_content.to_string());
439        }
440    }
441}
442
443fn parse_author_array(content: &str, fields: &mut HashMap<String, String>) {
444    if let Some(cap) = RE_AUTHOR_ARRAY.captures(content) {
445        let array_content = cap.get(1).map(|m| m.as_str()).unwrap_or("");
446
447        let authors: Vec<String> = RE_QUOTED_STRING
448            .captures_iter(array_content)
449            .take(MAX_ITERATION_COUNT)
450            .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
451            .collect();
452
453        if !authors.is_empty() {
454            // Store as JSON array for later processing
455            fields.insert("_ARRAY_AUTHOR".to_string(), authors.join("||"));
456        }
457    }
458}
459
460fn parse_author(fields: &HashMap<String, String>) -> Vec<Party> {
461    // Check for array of authors first
462    if let Some(authors_str) = fields.get("_ARRAY_AUTHOR") {
463        return authors_str
464            .split("||")
465            .filter_map(|author_str| {
466                if author_str.trim().is_empty() {
467                    return None;
468                }
469                let (name, email) = parse_author_string(author_str);
470                build_author_party(name, email)
471            })
472            .collect();
473    }
474
475    if let Some(author_str) = fields.get("AUTHOR") {
476        let (name, email) = parse_author_string(author_str);
477        return build_author_party(name, email).into_iter().collect();
478    }
479
480    Vec::new()
481}
482
483fn build_author_party(name: Option<String>, email: Option<String>) -> Option<Party> {
484    if name.is_none() && email.is_none() {
485        return None;
486    }
487
488    Some(Party {
489        role: Some("author".to_string()),
490        name,
491        email,
492        r#type: Some("person".to_string()),
493        url: None,
494        organization: None,
495        organization_url: None,
496        timezone: None,
497    })
498}
499
500fn parse_author_string(s: &str) -> (Option<String>, Option<String>) {
501    if let Some(start) = s.find('<')
502        && let Some(end) = s.find('>')
503        && start < end
504    {
505        let name = s[..start].trim();
506        let email = s[start + 1..end].trim();
507        return (sanitize_scalar_field(name), sanitize_scalar_field(email));
508    }
509    (sanitize_scalar_field(s), None)
510}
511
512fn parse_dependencies(fields: &HashMap<String, String>) -> Vec<Dependency> {
513    let mut dependencies = Vec::new();
514
515    // Parse PREREQ_PM as runtime dependencies
516    if let Some(hash_content) = fields.get("_HASH_PREREQ_PM") {
517        dependencies.extend(extract_deps_from_hash(hash_content, "runtime", true));
518    }
519
520    // Parse BUILD_REQUIRES
521    if let Some(hash_content) = fields.get("_HASH_BUILD_REQUIRES") {
522        dependencies.extend(extract_deps_from_hash(hash_content, "build", false));
523    }
524
525    // Parse TEST_REQUIRES
526    if let Some(hash_content) = fields.get("_HASH_TEST_REQUIRES") {
527        dependencies.extend(extract_deps_from_hash(hash_content, "test", false));
528    }
529
530    // Parse CONFIGURE_REQUIRES
531    if let Some(hash_content) = fields.get("_HASH_CONFIGURE_REQUIRES") {
532        dependencies.extend(extract_deps_from_hash(hash_content, "configure", false));
533    }
534
535    dependencies
536}
537
538fn extract_deps_from_hash(hash_content: &str, scope: &str, is_runtime: bool) -> Vec<Dependency> {
539    let mut deps = Vec::new();
540
541    for cap in RE_DEP_PAIR
542        .captures_iter(hash_content)
543        .take(MAX_ITERATION_COUNT)
544    {
545        let module_name = cap.get(1).map(|m| m.as_str()).unwrap_or("");
546
547        // Skip perl itself
548        if module_name == "perl" {
549            continue;
550        }
551
552        let version = cap
553            .get(2)
554            .or_else(|| cap.get(3))
555            .or_else(|| cap.get(4))
556            .map(|m| m.as_str());
557
558        let extracted_requirement = match version {
559            Some("0") | Some("") | None => None,
560            Some(v) => Some(truncate_field(v.to_string())),
561        };
562
563        let purl = PackageUrl::new("cpan", module_name)
564            .ok()
565            .map(|p| p.to_string());
566
567        deps.push(Dependency {
568            purl,
569            extracted_requirement,
570            scope: Some(truncate_field(scope.to_string())),
571            is_runtime: Some(is_runtime),
572            is_optional: Some(false),
573            is_pinned: None,
574            is_direct: Some(true),
575            resolved_package: None,
576            extra_data: None,
577        });
578    }
579
580    deps
581}