Skip to main content

provenant/parsers/
cpan_makefile_pl.rs

1// SPDX-FileCopyrightText: Provenant contributors
2// SPDX-License-Identifier: Apache-2.0
3
4//! Parser for CPAN Perl Makefile.PL files.
5//!
6//! Extracts Perl package metadata from `Makefile.PL` files used by ExtUtils::MakeMaker.
7//!
8//! # Supported Formats
9//! - `Makefile.PL` - CPAN ExtUtils::MakeMaker build configuration
10//!
11//! # Implementation Notes
12//! - Format: Perl script with WriteMakefile() or WriteMakefile1() calls
13//! - Spec: https://metacpan.org/pod/ExtUtils::MakeMaker
14//! - Extracts: NAME, VERSION, AUTHOR, LICENSE, ABSTRACT, PREREQ_PM, BUILD_REQUIRES, TEST_REQUIRES, CONFIGURE_REQUIRES
15//! - Uses regex-based extraction (no Perl code execution for security)
16//! - Python reference has stub-only handler with no parse() method - this is BEYOND PARITY
17
18use std::collections::HashMap;
19use std::path::Path;
20use std::sync::LazyLock;
21
22use crate::parser_warn as warn;
23use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
24use packageurl::PackageUrl;
25use regex::Regex;
26use serde_json::json;
27
28use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
29
30use super::PackageParser;
31use super::license_normalization::{
32    DeclaredLicenseMatchMetadata, NormalizedDeclaredLicense, build_declared_license_data,
33    empty_declared_license_data, normalize_declared_license_key, normalize_spdx_expression,
34};
35
/// Matches the start of a `WriteMakefile(` or `WriteMakefile1(` call, allowing
/// whitespace between the function name and the opening parenthesis.
static RE_WRITEMAKEFILE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"WriteMakefile1?\s*\(").expect("valid regex: WriteMakefile call pattern")
});
/// Matches a `KEY => value` pair whose key (uppercase letters and underscores)
/// starts a line, optionally after whitespace.
///
/// Group 1 is the key. The value is captured in group 2 (`'...'`), group 3
/// (`"..."`), group 4 (`q{...}`) or group 5 (`q(...)`), depending on quoting.
static RE_SIMPLE_KV: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"(?m)^\s*([A-Z_]+)\s*=>\s*(?:'([^']*)'|"([^"]*)"|q\{([^}]*)\}|q\(([^)]*)\))"#)
        .expect("valid regex: simple key=>value pattern")
});
/// Matches `KEY => { ... }` hash blocks. Group 1 is the key, group 2 the body.
///
/// The body may not contain `}`, so a block with nested braces is cut at the
/// first closing brace.
static RE_HASH_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"([A-Z_]+)\s*=>\s*\{([^}]*)\}").expect("valid regex: hash block pattern")
});
/// Matches `AUTHOR => [ ... ]`. Group 1 is the array body, which may not
/// contain `]`.
static RE_AUTHOR_ARRAY: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"AUTHOR\s*=>\s*\[([^\]]*)\]").expect("valid regex: AUTHOR array pattern")
});
/// Matches a single- or double-quoted string. Group 1 is the text between the
/// quotes.
///
/// NOTE(review): the opening and closing quote characters are not required to
/// be the same, and the body excludes both kinds of quote. A double-quoted
/// value containing an apostrophe (e.g. `"Tim O'Brien"`) therefore ends at the
/// apostrophe.
static RE_QUOTED_STRING: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"['"]([^'"]*)['"']"#).expect("valid regex: quoted string pattern")
});
52static RE_DEP_PAIR: LazyLock<Regex> = LazyLock::new(|| {
53    Regex::new(r#"['"]([^'"]+)['"]\s*=>\s*(?:'([^']*)'|"([^"]*)"|(\d+))"#)
54        .expect("valid regex: dependency pair pattern")
55});
/// Matches a line-leading `$VERSION = '...'` / `$VERSION = "..."` assignment,
/// optionally prefixed with `our` and/or a package qualifier
/// (`$Foo::Bar::VERSION`).
///
/// Group 1 holds a single-quoted value, group 2 a double-quoted one; both must
/// be non-empty.
static RE_VERSION_ASSIGNMENT: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?m)^\s*(?:our\s+)?\$(?:[A-Za-z_][\w:]*::)?VERSION\s*=\s*(?:'([^']+)'|"([^"]+)")"#,
    )
    .expect("valid regex: VERSION assignment pattern")
});
62
/// Package type stamped on every `PackageData` produced by this parser.
const PACKAGE_TYPE: PackageType = PackageType::Cpan;
/// Size limit (1 MiB) for files referenced through `VERSION_FROM` /
/// `ABSTRACT_FROM`; larger files are not read.
const MAX_METADATA_FILE_SIZE: u64 = 1024 * 1024;
65
/// Parser for CPAN `Makefile.PL` files; see the [`PackageParser`] impl below.
pub struct CpanMakefilePlParser;
67
68impl PackageParser for CpanMakefilePlParser {
69    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
70
71    fn is_match(path: &Path) -> bool {
72        path.file_name().is_some_and(|name| name == "Makefile.PL")
73    }
74
75    fn extract_packages(path: &Path) -> Vec<PackageData> {
76        let content = match read_file_to_string(path, None) {
77            Ok(c) => c,
78            Err(e) => {
79                warn!("Failed to read Makefile.PL file {:?}: {}", path, e);
80                return vec![PackageData {
81                    package_type: Some(PACKAGE_TYPE),
82                    primary_language: Some("Perl".to_string()),
83                    datasource_id: Some(DatasourceId::CpanMakefile),
84                    ..Default::default()
85                }];
86            }
87        };
88
89        vec![parse_makefile_pl_with_base(&content, path.parent())]
90    }
91}
92
/// Test-only convenience wrapper: parses `content` without a base directory,
/// so `VERSION_FROM` / `ABSTRACT_FROM` references are not resolved.
#[cfg(test)]
pub(crate) fn parse_makefile_pl(content: &str) -> PackageData {
    parse_makefile_pl_with_base(content, None)
}
97
98pub(crate) fn parse_makefile_pl_with_base(content: &str, base_dir: Option<&Path>) -> PackageData {
99    // Find WriteMakefile or WriteMakefile1 call
100    let makefile_block = extract_writemakefile_block(content);
101    if makefile_block.is_empty() {
102        return default_package_data();
103    }
104
105    let fields = parse_hash_fields(&makefile_block);
106
107    let name = fields.get("NAME").and_then(|n| sanitize_scalar_field(n));
108    let resolved_metadata = resolve_referenced_metadata(&fields, base_dir);
109
110    let version = fields
111        .get("VERSION")
112        .and_then(|v| sanitize_scalar_field(v))
113        .or_else(|| resolved_metadata.version.clone());
114    let description = fields
115        .get("ABSTRACT")
116        .and_then(|d| sanitize_scalar_field(d))
117        .or_else(|| resolved_metadata.abstract_text.clone());
118    let extracted_license_statement = fields.get("LICENSE").and_then(|l| sanitize_scalar_field(l));
119    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
120        extracted_license_statement
121            .as_deref()
122            .and_then(normalize_cpan_makefile_license)
123            .map(|normalized| {
124                build_declared_license_data(
125                    normalized,
126                    DeclaredLicenseMatchMetadata::single_line(
127                        extracted_license_statement.as_deref().unwrap_or_default(),
128                    ),
129                )
130            })
131            .unwrap_or_else(empty_declared_license_data);
132
133    let parties = parse_author(&fields);
134    let dependencies = parse_dependencies(&fields);
135
136    let mut extra_data = HashMap::new();
137    if let Some(min_perl) = fields
138        .get("MIN_PERL_VERSION")
139        .and_then(|value| sanitize_scalar_field(value))
140    {
141        extra_data.insert("MIN_PERL_VERSION".to_string(), json!(min_perl));
142    }
143    if let Some(version_from) = fields
144        .get("VERSION_FROM")
145        .and_then(|value| sanitize_scalar_field(value))
146    {
147        extra_data.insert("VERSION_FROM".to_string(), json!(version_from));
148    }
149    if let Some(abstract_from) = fields
150        .get("ABSTRACT_FROM")
151        .and_then(|value| sanitize_scalar_field(value))
152    {
153        extra_data.insert("ABSTRACT_FROM".to_string(), json!(abstract_from));
154    }
155
156    // Build PURL: convert Foo::Bar to Foo-Bar for CPAN naming convention
157    let purl = name.as_ref().and_then(|n| {
158        let purl_name = n.replace("::", "-");
159        PackageUrl::new("cpan", &purl_name).ok().map(|mut p| {
160            if let Some(v) = &version {
161                let _ = p.with_version(v).ok();
162            }
163            p.to_string()
164        })
165    });
166
167    PackageData {
168        package_type: Some(PACKAGE_TYPE),
169        namespace: Some("cpan".to_string()),
170        name,
171        version,
172        description,
173        declared_license_expression,
174        declared_license_expression_spdx,
175        license_detections,
176        extracted_license_statement,
177        parties,
178        dependencies,
179        extra_data: if extra_data.is_empty() {
180            None
181        } else {
182            Some(extra_data)
183        },
184        purl,
185        datasource_id: Some(DatasourceId::CpanMakefile),
186        primary_language: Some("Perl".to_string()),
187        ..Default::default()
188    }
189}
190
/// Metadata recovered from files that the `Makefile.PL` points at instead of
/// stating the value inline.
#[derive(Default)]
struct ResolvedMetadata {
    /// Version read from the file named by `VERSION_FROM`, if any.
    version: Option<String>,
    /// Abstract read from the file named by `ABSTRACT_FROM`, if any.
    abstract_text: Option<String>,
}
196
/// Builds the minimal package record: package type, primary language and
/// datasource id are set, every other field is left at its default.
fn default_package_data() -> PackageData {
    PackageData {
        package_type: Some(PACKAGE_TYPE),
        primary_language: Some("Perl".to_string()),
        datasource_id: Some(DatasourceId::CpanMakefile),
        ..Default::default()
    }
}
205
206fn normalize_cpan_makefile_license(value: &str) -> Option<NormalizedDeclaredLicense> {
207    match value.trim() {
208        "perl_5" | "Perl_5" => Some(NormalizedDeclaredLicense::new(
209            "gpl-1.0-plus OR artistic-perl-1.0",
210            "GPL-1.0-or-later OR Artistic-1.0-Perl",
211        )),
212        "artistic_2" => Some(NormalizedDeclaredLicense::new(
213            "artistic-2.0",
214            "Artistic-2.0",
215        )),
216        "apache_2_0" => Some(NormalizedDeclaredLicense::new("apache-2.0", "Apache-2.0")),
217        other => normalize_spdx_expression(other).or_else(|| normalize_declared_license_key(other)),
218    }
219}
220
221fn sanitize_scalar_field(value: &str) -> Option<String> {
222    let trimmed = value.trim();
223    if trimmed.is_empty() || looks_like_unresolved_template_value(trimmed) {
224        return None;
225    }
226
227    Some(truncate_field(trimmed.to_string()))
228}
229
/// Reports whether `value` looks like template syntax or boilerplate that was
/// never filled in.
///
/// A value is flagged when, after trimming, it
/// - contains any of the markers `[%`, `%]`, `<%`, `%>`, `${{`, `[d2%`, `%2d]`,
/// - contains both `{{` and `}}`, or
/// - equals one of the stock placeholder strings, ignoring ASCII case.
fn looks_like_unresolved_template_value(value: &str) -> bool {
    const MARKERS: [&str; 7] = ["[%", "%]", "<%", "%>", "${{", "[d2%", "%2d]"];
    const PLACEHOLDERS: [&str; 3] = [
        "YOUR NAME",
        "YOUR APPLICATION ABSTRACT",
        "YOUREMAIL@EXAMPLE.COM",
    ];

    let candidate = value.trim();

    if MARKERS.iter().any(|marker| candidate.contains(*marker)) {
        return true;
    }
    if candidate.contains("{{") && candidate.contains("}}") {
        return true;
    }

    PLACEHOLDERS
        .iter()
        .any(|placeholder| candidate.eq_ignore_ascii_case(placeholder))
}
247
248fn resolve_referenced_metadata(
249    fields: &HashMap<String, String>,
250    base_dir: Option<&Path>,
251) -> ResolvedMetadata {
252    let Some(base_dir) = base_dir else {
253        return ResolvedMetadata::default();
254    };
255
256    let mut resolved = ResolvedMetadata::default();
257    let mut cache: HashMap<String, Option<String>> = HashMap::new();
258
259    if let Some(version_from) = fields.get("VERSION_FROM")
260        && !looks_like_unresolved_template_value(version_from)
261        && let Some(content) = load_referenced_metadata_file(base_dir, version_from, &mut cache)
262    {
263        resolved.version = extract_version_from_module_content(content);
264    }
265
266    if let Some(abstract_from) = fields.get("ABSTRACT_FROM")
267        && !looks_like_unresolved_template_value(abstract_from)
268        && let Some(content) = load_referenced_metadata_file(base_dir, abstract_from, &mut cache)
269    {
270        resolved.abstract_text = extract_abstract_from_module_content(content);
271    }
272
273    resolved
274}
275
276fn load_referenced_metadata_file<'a>(
277    base_dir: &Path,
278    relative_path: &str,
279    cache: &'a mut HashMap<String, Option<String>>,
280) -> Option<&'a String> {
281    let entry = cache
282        .entry(relative_path.to_string())
283        .or_insert_with(|| read_safe_metadata_file(base_dir, relative_path));
284    entry.as_ref()
285}
286
/// Reads a file referenced from the `Makefile.PL`, refusing anything that is
/// not a regular file inside `base_dir`.
///
/// Returns `None` when `relative_path` is absolute, when either path cannot be
/// canonicalized, when the canonical target is not located under the canonical
/// `base_dir`, when the target is not a regular file or exceeds
/// `MAX_METADATA_FILE_SIZE`, or when reading fails.
fn read_safe_metadata_file(base_dir: &Path, relative_path: &str) -> Option<String> {
    let ref_path = Path::new(relative_path);
    // Absolute references are rejected before touching the filesystem.
    if ref_path.is_absolute() {
        return None;
    }

    // Compare canonical forms so `..` components and symlinks cannot be used
    // to point outside the base directory.
    let base_dir = base_dir.canonicalize().ok()?;
    let candidate = base_dir.join(ref_path);
    let canonical_candidate = candidate.canonicalize().ok()?;
    if !canonical_candidate.starts_with(&base_dir) {
        return None;
    }

    // Only regular files within the size limit are read.
    let metadata = std::fs::metadata(&canonical_candidate).ok()?;
    if !metadata.is_file() || metadata.len() > MAX_METADATA_FILE_SIZE {
        return None;
    }

    read_file_to_string(&canonical_candidate, None).ok()
}
307
308fn extract_version_from_module_content(content: &str) -> Option<String> {
309    RE_VERSION_ASSIGNMENT
310        .captures(content)
311        .and_then(|caps| caps.get(1).or_else(|| caps.get(2)))
312        .map(|m| m.as_str().trim().to_string())
313        .map(truncate_field)
314        .filter(|value| !value.is_empty())
315}
316
317fn extract_abstract_from_module_content(content: &str) -> Option<String> {
318    let mut in_name_section = false;
319
320    for line in content.lines() {
321        let trimmed = line.trim();
322        if trimmed == "=head1 NAME" {
323            in_name_section = true;
324            continue;
325        }
326
327        if in_name_section {
328            if trimmed.starts_with('=') {
329                break;
330            }
331            if trimmed.is_empty() {
332                continue;
333            }
334
335            if let Some((_, abstract_text)) = trimmed.split_once(" - ") {
336                let abstract_text = abstract_text.trim();
337                if !abstract_text.is_empty() {
338                    return Some(truncate_field(abstract_text.to_string()));
339                }
340            }
341        }
342    }
343
344    None
345}
346
347fn extract_writemakefile_block(content: &str) -> String {
348    let start_match = match RE_WRITEMAKEFILE.find(content) {
349        Some(m) => m,
350        None => return String::new(),
351    };
352
353    let start_pos = start_match.end();
354    let content_from_start = &content[start_pos..];
355
356    // Find the matching closing parenthesis
357    let mut depth = 1;
358    let mut end_pos = 0;
359    let chars: Vec<char> = content_from_start.chars().collect();
360
361    for (i, &ch) in chars.iter().enumerate() {
362        if i >= MAX_ITERATION_COUNT {
363            break;
364        }
365        match ch {
366            '(' => depth += 1,
367            ')' => {
368                depth -= 1;
369                if depth == 0 {
370                    end_pos = i;
371                    break;
372                }
373            }
374            _ => {}
375        }
376    }
377
378    if end_pos > 0 {
379        content_from_start[..end_pos].to_string()
380    } else {
381        String::new()
382    }
383}
384
385fn parse_hash_fields(content: &str) -> HashMap<String, String> {
386    let mut fields = HashMap::new();
387
388    for cap in RE_SIMPLE_KV
389        .captures_iter(content)
390        .take(MAX_ITERATION_COUNT)
391    {
392        let key = cap.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
393        let value = cap
394            .get(2)
395            .or_else(|| cap.get(3))
396            .or_else(|| cap.get(4))
397            .or_else(|| cap.get(5))
398            .map(|m| m.as_str().to_string());
399
400        if let Some(v) = value {
401            fields.insert(key, v);
402        }
403    }
404
405    // Parse hash values (PREREQ_PM, BUILD_REQUIRES, etc.)
406    parse_hash_dependencies(content, &mut fields);
407
408    // Parse array refs for AUTHOR
409    parse_author_array(content, &mut fields);
410
411    fields
412}
413
414fn parse_hash_dependencies(content: &str, fields: &mut HashMap<String, String>) {
415    for cap in RE_HASH_BLOCK
416        .captures_iter(content)
417        .take(MAX_ITERATION_COUNT)
418    {
419        let key = cap.get(1).map(|m| m.as_str()).unwrap_or("");
420        let hash_content = cap.get(2).map(|m| m.as_str()).unwrap_or("");
421
422        // For dependency hashes, we'll store them with a special marker
423        // so parse_dependencies can find them
424        if matches!(
425            key,
426            "PREREQ_PM" | "BUILD_REQUIRES" | "TEST_REQUIRES" | "CONFIGURE_REQUIRES"
427        ) {
428            fields.insert(format!("_HASH_{}", key), hash_content.to_string());
429        }
430    }
431}
432
433fn parse_author_array(content: &str, fields: &mut HashMap<String, String>) {
434    if let Some(cap) = RE_AUTHOR_ARRAY.captures(content) {
435        let array_content = cap.get(1).map(|m| m.as_str()).unwrap_or("");
436
437        let authors: Vec<String> = RE_QUOTED_STRING
438            .captures_iter(array_content)
439            .take(MAX_ITERATION_COUNT)
440            .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
441            .collect();
442
443        if !authors.is_empty() {
444            // Store as JSON array for later processing
445            fields.insert("_ARRAY_AUTHOR".to_string(), authors.join("||"));
446        }
447    }
448}
449
450fn parse_author(fields: &HashMap<String, String>) -> Vec<Party> {
451    // Check for array of authors first
452    if let Some(authors_str) = fields.get("_ARRAY_AUTHOR") {
453        return authors_str
454            .split("||")
455            .filter_map(|author_str| {
456                if author_str.trim().is_empty() {
457                    return None;
458                }
459                let (name, email) = parse_author_string(author_str);
460                build_author_party(name, email)
461            })
462            .collect();
463    }
464
465    if let Some(author_str) = fields.get("AUTHOR") {
466        let (name, email) = parse_author_string(author_str);
467        return build_author_party(name, email).into_iter().collect();
468    }
469
470    Vec::new()
471}
472
473fn build_author_party(name: Option<String>, email: Option<String>) -> Option<Party> {
474    if name.is_none() && email.is_none() {
475        return None;
476    }
477
478    Some(Party {
479        role: Some("author".to_string()),
480        name,
481        email,
482        r#type: Some("person".to_string()),
483        url: None,
484        organization: None,
485        organization_url: None,
486        timezone: None,
487    })
488}
489
490fn parse_author_string(s: &str) -> (Option<String>, Option<String>) {
491    if let Some(start) = s.find('<')
492        && let Some(end) = s.find('>')
493        && start < end
494    {
495        let name = s[..start].trim();
496        let email = s[start + 1..end].trim();
497        return (sanitize_scalar_field(name), sanitize_scalar_field(email));
498    }
499    (sanitize_scalar_field(s), None)
500}
501
502fn parse_dependencies(fields: &HashMap<String, String>) -> Vec<Dependency> {
503    let mut dependencies = Vec::new();
504
505    // Parse PREREQ_PM as runtime dependencies
506    if let Some(hash_content) = fields.get("_HASH_PREREQ_PM") {
507        dependencies.extend(extract_deps_from_hash(hash_content, "runtime", true));
508    }
509
510    // Parse BUILD_REQUIRES
511    if let Some(hash_content) = fields.get("_HASH_BUILD_REQUIRES") {
512        dependencies.extend(extract_deps_from_hash(hash_content, "build", false));
513    }
514
515    // Parse TEST_REQUIRES
516    if let Some(hash_content) = fields.get("_HASH_TEST_REQUIRES") {
517        dependencies.extend(extract_deps_from_hash(hash_content, "test", false));
518    }
519
520    // Parse CONFIGURE_REQUIRES
521    if let Some(hash_content) = fields.get("_HASH_CONFIGURE_REQUIRES") {
522        dependencies.extend(extract_deps_from_hash(hash_content, "configure", false));
523    }
524
525    dependencies
526}
527
528fn extract_deps_from_hash(hash_content: &str, scope: &str, is_runtime: bool) -> Vec<Dependency> {
529    let mut deps = Vec::new();
530
531    for cap in RE_DEP_PAIR
532        .captures_iter(hash_content)
533        .take(MAX_ITERATION_COUNT)
534    {
535        let module_name = cap.get(1).map(|m| m.as_str()).unwrap_or("");
536
537        // Skip perl itself
538        if module_name == "perl" {
539            continue;
540        }
541
542        let version = cap
543            .get(2)
544            .or_else(|| cap.get(3))
545            .or_else(|| cap.get(4))
546            .map(|m| m.as_str());
547
548        let extracted_requirement = match version {
549            Some("0") | Some("") | None => None,
550            Some(v) => Some(truncate_field(v.to_string())),
551        };
552
553        let purl = PackageUrl::new("cpan", module_name)
554            .ok()
555            .map(|p| p.to_string());
556
557        deps.push(Dependency {
558            purl,
559            extracted_requirement,
560            scope: Some(truncate_field(scope.to_string())),
561            is_runtime: Some(is_runtime),
562            is_optional: Some(false),
563            is_pinned: None,
564            is_direct: Some(true),
565            resolved_package: None,
566            extra_data: None,
567        });
568    }
569
570    deps
571}
572
// Parser registration metadata. NOTE(review): the argument meanings are
// inferred from their values (description, path patterns, package type,
// primary language, documentation URL) — confirm against the
// `register_parser!` macro definition.
crate::register_parser!(
    "CPAN Perl Makefile.PL",
    &["*/Makefile.PL"],
    "cpan",
    "Perl",
    Some("https://metacpan.org/pod/ExtUtils::MakeMaker"),
);