Skip to main content

provenant/parsers/
cpan_makefile_pl.rs

1//! Parser for CPAN Perl Makefile.PL files.
2//!
3//! Extracts Perl package metadata from `Makefile.PL` files used by ExtUtils::MakeMaker.
4//!
5//! # Supported Formats
6//! - `Makefile.PL` - CPAN ExtUtils::MakeMaker build configuration
7//!
8//! # Implementation Notes
9//! - Format: Perl script with WriteMakefile() or WriteMakefile1() calls
10//! - Spec: https://metacpan.org/pod/ExtUtils::MakeMaker
11//! - Extracts: NAME, VERSION, AUTHOR, LICENSE, ABSTRACT, PREREQ_PM, BUILD_REQUIRES, TEST_REQUIRES, CONFIGURE_REQUIRES
12//! - Uses regex-based extraction (no Perl code execution for security)
13//! - Python reference has stub-only handler with no parse() method - this is BEYOND PARITY
14
15use std::collections::HashMap;
16use std::path::Path;
17use std::sync::LazyLock;
18
19use crate::parser_warn as warn;
20use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
21use packageurl::PackageUrl;
22use regex::Regex;
23use serde_json::json;
24
25use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
26
27use super::PackageParser;
28use super::license_normalization::{
29    DeclaredLicenseMatchMetadata, NormalizedDeclaredLicense, build_declared_license_data,
30    empty_declared_license_data, normalize_declared_license_key, normalize_spdx_expression,
31};
32
33static RE_WRITEMAKEFILE: LazyLock<Regex> = LazyLock::new(|| {
34    Regex::new(r"WriteMakefile1?\s*\(").expect("valid regex: WriteMakefile call pattern")
35});
36static RE_SIMPLE_KV: LazyLock<Regex> = LazyLock::new(|| {
37    Regex::new(r#"(?m)^\s*([A-Z_]+)\s*=>\s*(?:'([^']*)'|"([^"]*)"|q\{([^}]*)\}|q\(([^)]*)\))"#)
38        .expect("valid regex: simple key=>value pattern")
39});
40static RE_HASH_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
41    Regex::new(r"([A-Z_]+)\s*=>\s*\{([^}]*)\}").expect("valid regex: hash block pattern")
42});
43static RE_AUTHOR_ARRAY: LazyLock<Regex> = LazyLock::new(|| {
44    Regex::new(r"AUTHOR\s*=>\s*\[([^\]]*)\]").expect("valid regex: AUTHOR array pattern")
45});
46static RE_QUOTED_STRING: LazyLock<Regex> = LazyLock::new(|| {
47    Regex::new(r#"['"]([^'"]*)['"']"#).expect("valid regex: quoted string pattern")
48});
49static RE_DEP_PAIR: LazyLock<Regex> = LazyLock::new(|| {
50    Regex::new(r#"['"]([^'"]+)['"]\s*=>\s*(?:'([^']*)'|"([^"]*)"|(\d+))"#)
51        .expect("valid regex: dependency pair pattern")
52});
53static RE_VERSION_ASSIGNMENT: LazyLock<Regex> = LazyLock::new(|| {
54    Regex::new(
55        r#"(?m)^\s*(?:our\s+)?\$(?:[A-Za-z_][\w:]*::)?VERSION\s*=\s*(?:'([^']+)'|"([^"]+)")"#,
56    )
57    .expect("valid regex: VERSION assignment pattern")
58});
59
/// Package type reported for every record produced by this parser.
const PACKAGE_TYPE: PackageType = PackageType::Cpan;
/// Largest file size in bytes (1 MiB) accepted when following `VERSION_FROM` /
/// `ABSTRACT_FROM` references; larger files are ignored.
const MAX_METADATA_FILE_SIZE: u64 = 1024 * 1024;
62
63pub struct CpanMakefilePlParser;
64
65impl PackageParser for CpanMakefilePlParser {
66    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
67
68    fn is_match(path: &Path) -> bool {
69        path.file_name().is_some_and(|name| name == "Makefile.PL")
70    }
71
72    fn extract_packages(path: &Path) -> Vec<PackageData> {
73        let content = match read_file_to_string(path, None) {
74            Ok(c) => c,
75            Err(e) => {
76                warn!("Failed to read Makefile.PL file {:?}: {}", path, e);
77                return vec![PackageData {
78                    package_type: Some(PACKAGE_TYPE),
79                    primary_language: Some("Perl".to_string()),
80                    datasource_id: Some(DatasourceId::CpanMakefile),
81                    ..Default::default()
82                }];
83            }
84        };
85
86        vec![parse_makefile_pl_with_base(&content, path.parent())]
87    }
88}
89
/// Test-only convenience wrapper: parses `content` without a base directory, so
/// `VERSION_FROM` / `ABSTRACT_FROM` references are never followed.
#[cfg(test)]
pub(crate) fn parse_makefile_pl(content: &str) -> PackageData {
    parse_makefile_pl_with_base(content, None)
}
94
95pub(crate) fn parse_makefile_pl_with_base(content: &str, base_dir: Option<&Path>) -> PackageData {
96    // Find WriteMakefile or WriteMakefile1 call
97    let makefile_block = extract_writemakefile_block(content);
98    if makefile_block.is_empty() {
99        return default_package_data();
100    }
101
102    let fields = parse_hash_fields(&makefile_block);
103
104    let name = fields.get("NAME").and_then(|n| sanitize_scalar_field(n));
105    let resolved_metadata = resolve_referenced_metadata(&fields, base_dir);
106
107    let version = fields
108        .get("VERSION")
109        .and_then(|v| sanitize_scalar_field(v))
110        .or_else(|| resolved_metadata.version.clone());
111    let description = fields
112        .get("ABSTRACT")
113        .and_then(|d| sanitize_scalar_field(d))
114        .or_else(|| resolved_metadata.abstract_text.clone());
115    let extracted_license_statement = fields.get("LICENSE").and_then(|l| sanitize_scalar_field(l));
116    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
117        extracted_license_statement
118            .as_deref()
119            .and_then(normalize_cpan_makefile_license)
120            .map(|normalized| {
121                build_declared_license_data(
122                    normalized,
123                    DeclaredLicenseMatchMetadata::single_line(
124                        extracted_license_statement.as_deref().unwrap_or_default(),
125                    ),
126                )
127            })
128            .unwrap_or_else(empty_declared_license_data);
129
130    let parties = parse_author(&fields);
131    let dependencies = parse_dependencies(&fields);
132
133    let mut extra_data = HashMap::new();
134    if let Some(min_perl) = fields
135        .get("MIN_PERL_VERSION")
136        .and_then(|value| sanitize_scalar_field(value))
137    {
138        extra_data.insert("MIN_PERL_VERSION".to_string(), json!(min_perl));
139    }
140    if let Some(version_from) = fields
141        .get("VERSION_FROM")
142        .and_then(|value| sanitize_scalar_field(value))
143    {
144        extra_data.insert("VERSION_FROM".to_string(), json!(version_from));
145    }
146    if let Some(abstract_from) = fields
147        .get("ABSTRACT_FROM")
148        .and_then(|value| sanitize_scalar_field(value))
149    {
150        extra_data.insert("ABSTRACT_FROM".to_string(), json!(abstract_from));
151    }
152
153    // Build PURL: convert Foo::Bar to Foo-Bar for CPAN naming convention
154    let purl = name.as_ref().and_then(|n| {
155        let purl_name = n.replace("::", "-");
156        PackageUrl::new("cpan", &purl_name).ok().map(|mut p| {
157            if let Some(v) = &version {
158                let _ = p.with_version(v).ok();
159            }
160            p.to_string()
161        })
162    });
163
164    PackageData {
165        package_type: Some(PACKAGE_TYPE),
166        namespace: Some("cpan".to_string()),
167        name,
168        version,
169        description,
170        declared_license_expression,
171        declared_license_expression_spdx,
172        license_detections,
173        extracted_license_statement,
174        parties,
175        dependencies,
176        extra_data: if extra_data.is_empty() {
177            None
178        } else {
179            Some(extra_data)
180        },
181        purl,
182        datasource_id: Some(DatasourceId::CpanMakefile),
183        primary_language: Some("Perl".to_string()),
184        ..Default::default()
185    }
186}
187
/// Metadata recovered from files referenced by `VERSION_FROM` / `ABSTRACT_FROM`.
#[derive(Default)]
struct ResolvedMetadata {
    /// Version found in the `VERSION_FROM` file, if any.
    version: Option<String>,
    /// Abstract found in the `ABSTRACT_FROM` file, if any.
    abstract_text: Option<String>,
}
193
/// Builds the minimal record used when nothing could be parsed: only the package type,
/// primary language and datasource id are set.
fn default_package_data() -> PackageData {
    PackageData {
        package_type: Some(PACKAGE_TYPE),
        primary_language: Some("Perl".to_string()),
        datasource_id: Some(DatasourceId::CpanMakefile),
        ..Default::default()
    }
}
202
203fn normalize_cpan_makefile_license(value: &str) -> Option<NormalizedDeclaredLicense> {
204    match value.trim() {
205        "perl_5" | "Perl_5" => Some(NormalizedDeclaredLicense::new(
206            "gpl-1.0-plus OR artistic-perl-1.0",
207            "GPL-1.0-or-later OR Artistic-1.0-Perl",
208        )),
209        "artistic_2" => Some(NormalizedDeclaredLicense::new(
210            "artistic-2.0",
211            "Artistic-2.0",
212        )),
213        "apache_2_0" => Some(NormalizedDeclaredLicense::new("apache-2.0", "Apache-2.0")),
214        other => normalize_spdx_expression(other).or_else(|| normalize_declared_license_key(other)),
215    }
216}
217
218fn sanitize_scalar_field(value: &str) -> Option<String> {
219    let trimmed = value.trim();
220    if trimmed.is_empty() || looks_like_unresolved_template_value(trimmed) {
221        return None;
222    }
223
224    Some(truncate_field(trimmed.to_string()))
225}
226
/// Reports whether `value` looks like a template placeholder rather than real metadata.
///
/// A value is flagged when it contains a known template delimiter, or when (ignoring
/// surrounding whitespace and ASCII case) it equals one of the stock scaffold strings.
fn looks_like_unresolved_template_value(value: &str) -> bool {
    // Any one of these substrings is enough to flag the value.
    const DELIMITERS: [&str; 7] = ["[%", "%]", "<%", "%>", "${{", "[d2%", "%2d]"];
    // Boilerplate left behind by project scaffolding, compared case-insensitively.
    const SCAFFOLD_VALUES: [&str; 3] = [
        "YOUR NAME",
        "YOUR APPLICATION ABSTRACT",
        "YOUREMAIL@EXAMPLE.COM",
    ];

    let text = value.trim();

    if DELIMITERS.iter().any(|delimiter| text.contains(delimiter)) {
        return true;
    }
    // Double braces only count when both the opener and the closer are present.
    if text.contains("{{") && text.contains("}}") {
        return true;
    }

    SCAFFOLD_VALUES
        .iter()
        .any(|placeholder| text.eq_ignore_ascii_case(placeholder))
}
244
245fn resolve_referenced_metadata(
246    fields: &HashMap<String, String>,
247    base_dir: Option<&Path>,
248) -> ResolvedMetadata {
249    let Some(base_dir) = base_dir else {
250        return ResolvedMetadata::default();
251    };
252
253    let mut resolved = ResolvedMetadata::default();
254    let mut cache: HashMap<String, Option<String>> = HashMap::new();
255
256    if let Some(version_from) = fields.get("VERSION_FROM")
257        && !looks_like_unresolved_template_value(version_from)
258        && let Some(content) = load_referenced_metadata_file(base_dir, version_from, &mut cache)
259    {
260        resolved.version = extract_version_from_module_content(content);
261    }
262
263    if let Some(abstract_from) = fields.get("ABSTRACT_FROM")
264        && !looks_like_unresolved_template_value(abstract_from)
265        && let Some(content) = load_referenced_metadata_file(base_dir, abstract_from, &mut cache)
266    {
267        resolved.abstract_text = extract_abstract_from_module_content(content);
268    }
269
270    resolved
271}
272
273fn load_referenced_metadata_file<'a>(
274    base_dir: &Path,
275    relative_path: &str,
276    cache: &'a mut HashMap<String, Option<String>>,
277) -> Option<&'a String> {
278    let entry = cache
279        .entry(relative_path.to_string())
280        .or_insert_with(|| read_safe_metadata_file(base_dir, relative_path));
281    entry.as_ref()
282}
283
284fn read_safe_metadata_file(base_dir: &Path, relative_path: &str) -> Option<String> {
285    let ref_path = Path::new(relative_path);
286    if ref_path.is_absolute() {
287        return None;
288    }
289
290    let base_dir = base_dir.canonicalize().ok()?;
291    let candidate = base_dir.join(ref_path);
292    let canonical_candidate = candidate.canonicalize().ok()?;
293    if !canonical_candidate.starts_with(&base_dir) {
294        return None;
295    }
296
297    let metadata = std::fs::metadata(&canonical_candidate).ok()?;
298    if !metadata.is_file() || metadata.len() > MAX_METADATA_FILE_SIZE {
299        return None;
300    }
301
302    read_file_to_string(&canonical_candidate, None).ok()
303}
304
305fn extract_version_from_module_content(content: &str) -> Option<String> {
306    RE_VERSION_ASSIGNMENT
307        .captures(content)
308        .and_then(|caps| caps.get(1).or_else(|| caps.get(2)))
309        .map(|m| m.as_str().trim().to_string())
310        .map(truncate_field)
311        .filter(|value| !value.is_empty())
312}
313
314fn extract_abstract_from_module_content(content: &str) -> Option<String> {
315    let mut in_name_section = false;
316
317    for line in content.lines() {
318        let trimmed = line.trim();
319        if trimmed == "=head1 NAME" {
320            in_name_section = true;
321            continue;
322        }
323
324        if in_name_section {
325            if trimmed.starts_with('=') {
326                break;
327            }
328            if trimmed.is_empty() {
329                continue;
330            }
331
332            if let Some((_, abstract_text)) = trimmed.split_once(" - ") {
333                let abstract_text = abstract_text.trim();
334                if !abstract_text.is_empty() {
335                    return Some(truncate_field(abstract_text.to_string()));
336                }
337            }
338        }
339    }
340
341    None
342}
343
344fn extract_writemakefile_block(content: &str) -> String {
345    let start_match = match RE_WRITEMAKEFILE.find(content) {
346        Some(m) => m,
347        None => return String::new(),
348    };
349
350    let start_pos = start_match.end();
351    let content_from_start = &content[start_pos..];
352
353    // Find the matching closing parenthesis
354    let mut depth = 1;
355    let mut end_pos = 0;
356    let chars: Vec<char> = content_from_start.chars().collect();
357
358    for (i, &ch) in chars.iter().enumerate() {
359        if i >= MAX_ITERATION_COUNT {
360            break;
361        }
362        match ch {
363            '(' => depth += 1,
364            ')' => {
365                depth -= 1;
366                if depth == 0 {
367                    end_pos = i;
368                    break;
369                }
370            }
371            _ => {}
372        }
373    }
374
375    if end_pos > 0 {
376        content_from_start[..end_pos].to_string()
377    } else {
378        String::new()
379    }
380}
381
382fn parse_hash_fields(content: &str) -> HashMap<String, String> {
383    let mut fields = HashMap::new();
384
385    for cap in RE_SIMPLE_KV
386        .captures_iter(content)
387        .take(MAX_ITERATION_COUNT)
388    {
389        let key = cap.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
390        let value = cap
391            .get(2)
392            .or_else(|| cap.get(3))
393            .or_else(|| cap.get(4))
394            .or_else(|| cap.get(5))
395            .map(|m| m.as_str().to_string());
396
397        if let Some(v) = value {
398            fields.insert(key, v);
399        }
400    }
401
402    // Parse hash values (PREREQ_PM, BUILD_REQUIRES, etc.)
403    parse_hash_dependencies(content, &mut fields);
404
405    // Parse array refs for AUTHOR
406    parse_author_array(content, &mut fields);
407
408    fields
409}
410
411fn parse_hash_dependencies(content: &str, fields: &mut HashMap<String, String>) {
412    for cap in RE_HASH_BLOCK
413        .captures_iter(content)
414        .take(MAX_ITERATION_COUNT)
415    {
416        let key = cap.get(1).map(|m| m.as_str()).unwrap_or("");
417        let hash_content = cap.get(2).map(|m| m.as_str()).unwrap_or("");
418
419        // For dependency hashes, we'll store them with a special marker
420        // so parse_dependencies can find them
421        if matches!(
422            key,
423            "PREREQ_PM" | "BUILD_REQUIRES" | "TEST_REQUIRES" | "CONFIGURE_REQUIRES"
424        ) {
425            fields.insert(format!("_HASH_{}", key), hash_content.to_string());
426        }
427    }
428}
429
430fn parse_author_array(content: &str, fields: &mut HashMap<String, String>) {
431    if let Some(cap) = RE_AUTHOR_ARRAY.captures(content) {
432        let array_content = cap.get(1).map(|m| m.as_str()).unwrap_or("");
433
434        let authors: Vec<String> = RE_QUOTED_STRING
435            .captures_iter(array_content)
436            .take(MAX_ITERATION_COUNT)
437            .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
438            .collect();
439
440        if !authors.is_empty() {
441            // Store as JSON array for later processing
442            fields.insert("_ARRAY_AUTHOR".to_string(), authors.join("||"));
443        }
444    }
445}
446
447fn parse_author(fields: &HashMap<String, String>) -> Vec<Party> {
448    // Check for array of authors first
449    if let Some(authors_str) = fields.get("_ARRAY_AUTHOR") {
450        return authors_str
451            .split("||")
452            .filter_map(|author_str| {
453                if author_str.trim().is_empty() {
454                    return None;
455                }
456                let (name, email) = parse_author_string(author_str);
457                build_author_party(name, email)
458            })
459            .collect();
460    }
461
462    if let Some(author_str) = fields.get("AUTHOR") {
463        let (name, email) = parse_author_string(author_str);
464        return build_author_party(name, email).into_iter().collect();
465    }
466
467    Vec::new()
468}
469
470fn build_author_party(name: Option<String>, email: Option<String>) -> Option<Party> {
471    if name.is_none() && email.is_none() {
472        return None;
473    }
474
475    Some(Party {
476        role: Some("author".to_string()),
477        name,
478        email,
479        r#type: Some("person".to_string()),
480        url: None,
481        organization: None,
482        organization_url: None,
483        timezone: None,
484    })
485}
486
487fn parse_author_string(s: &str) -> (Option<String>, Option<String>) {
488    if let Some(start) = s.find('<')
489        && let Some(end) = s.find('>')
490        && start < end
491    {
492        let name = s[..start].trim();
493        let email = s[start + 1..end].trim();
494        return (sanitize_scalar_field(name), sanitize_scalar_field(email));
495    }
496    (sanitize_scalar_field(s), None)
497}
498
499fn parse_dependencies(fields: &HashMap<String, String>) -> Vec<Dependency> {
500    let mut dependencies = Vec::new();
501
502    // Parse PREREQ_PM as runtime dependencies
503    if let Some(hash_content) = fields.get("_HASH_PREREQ_PM") {
504        dependencies.extend(extract_deps_from_hash(hash_content, "runtime", true));
505    }
506
507    // Parse BUILD_REQUIRES
508    if let Some(hash_content) = fields.get("_HASH_BUILD_REQUIRES") {
509        dependencies.extend(extract_deps_from_hash(hash_content, "build", false));
510    }
511
512    // Parse TEST_REQUIRES
513    if let Some(hash_content) = fields.get("_HASH_TEST_REQUIRES") {
514        dependencies.extend(extract_deps_from_hash(hash_content, "test", false));
515    }
516
517    // Parse CONFIGURE_REQUIRES
518    if let Some(hash_content) = fields.get("_HASH_CONFIGURE_REQUIRES") {
519        dependencies.extend(extract_deps_from_hash(hash_content, "configure", false));
520    }
521
522    dependencies
523}
524
525fn extract_deps_from_hash(hash_content: &str, scope: &str, is_runtime: bool) -> Vec<Dependency> {
526    let mut deps = Vec::new();
527
528    for cap in RE_DEP_PAIR
529        .captures_iter(hash_content)
530        .take(MAX_ITERATION_COUNT)
531    {
532        let module_name = cap.get(1).map(|m| m.as_str()).unwrap_or("");
533
534        // Skip perl itself
535        if module_name == "perl" {
536            continue;
537        }
538
539        let version = cap
540            .get(2)
541            .or_else(|| cap.get(3))
542            .or_else(|| cap.get(4))
543            .map(|m| m.as_str());
544
545        let extracted_requirement = match version {
546            Some("0") | Some("") | None => None,
547            Some(v) => Some(truncate_field(v.to_string())),
548        };
549
550        let purl = PackageUrl::new("cpan", module_name)
551            .ok()
552            .map(|p| p.to_string());
553
554        deps.push(Dependency {
555            purl,
556            extracted_requirement,
557            scope: Some(truncate_field(scope.to_string())),
558            is_runtime: Some(is_runtime),
559            is_optional: Some(false),
560            is_pinned: None,
561            is_direct: Some(true),
562            resolved_package: None,
563            extra_data: None,
564        });
565    }
566
567    deps
568}
569
// Registration entry for this parser. The arguments appear to be: display name, path
// glob(s), package type, primary language and documentation URL.
// NOTE(review): argument meanings are inferred from the values; confirm against the
// `register_parser!` macro definition.
crate::register_parser!(
    "CPAN Perl Makefile.PL",
    &["*/Makefile.PL"],
    "cpan",
    "Perl",
    Some("https://metacpan.org/pod/ExtUtils::MakeMaker"),
);