Skip to main content

provenant/parsers/
cpan_makefile_pl.rs

1//! Parser for CPAN Perl Makefile.PL files.
2//!
3//! Extracts Perl package metadata from `Makefile.PL` files used by ExtUtils::MakeMaker.
4//!
5//! # Supported Formats
6//! - `Makefile.PL` - CPAN ExtUtils::MakeMaker build configuration
7//!
8//! # Implementation Notes
9//! - Format: Perl script with WriteMakefile() or WriteMakefile1() calls
10//! - Spec: https://metacpan.org/pod/ExtUtils::MakeMaker
11//! - Extracts: NAME, VERSION, AUTHOR, LICENSE, ABSTRACT, PREREQ_PM, BUILD_REQUIRES, TEST_REQUIRES, CONFIGURE_REQUIRES
12//! - Uses regex-based extraction (no Perl code execution for security)
13//! - Python reference has stub-only handler with no parse() method - this is BEYOND PARITY
14
15use std::collections::HashMap;
16use std::path::Path;
17use std::sync::LazyLock;
18
19use crate::parser_warn as warn;
20use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
21use packageurl::PackageUrl;
22use regex::Regex;
23use serde_json::json;
24
25use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
26
27use super::PackageParser;
28use super::license_normalization::{
29    DeclaredLicenseMatchMetadata, NormalizedDeclaredLicense, build_declared_license_data,
30    empty_declared_license_data, normalize_declared_license_key, normalize_spdx_expression,
31};
32
33static RE_WRITEMAKEFILE: LazyLock<Regex> =
34    LazyLock::new(|| Regex::new(r"WriteMakefile1?\s*\(").unwrap());
35static RE_SIMPLE_KV: LazyLock<Regex> = LazyLock::new(|| {
36    Regex::new(r#"(?m)^\s*([A-Z_]+)\s*=>\s*(?:'([^']*)'|"([^"]*)"|q\{([^}]*)\}|q\(([^)]*)\))"#)
37        .unwrap()
38});
39static RE_HASH_BLOCK: LazyLock<Regex> =
40    LazyLock::new(|| Regex::new(r"([A-Z_]+)\s*=>\s*\{([^}]*)\}").unwrap());
41static RE_AUTHOR_ARRAY: LazyLock<Regex> =
42    LazyLock::new(|| Regex::new(r"AUTHOR\s*=>\s*\[([^\]]*)\]").unwrap());
43static RE_QUOTED_STRING: LazyLock<Regex> =
44    LazyLock::new(|| Regex::new(r#"['"]([^'"]*)['"']"#).unwrap());
45static RE_DEP_PAIR: LazyLock<Regex> = LazyLock::new(|| {
46    Regex::new(r#"['"]([^'"]+)['"]\s*=>\s*(?:'([^']*)'|"([^"]*)"|(\d+))"#).unwrap()
47});
48static RE_VERSION_ASSIGNMENT: LazyLock<Regex> = LazyLock::new(|| {
49    Regex::new(
50        r#"(?m)^\s*(?:our\s+)?\$(?:[A-Za-z_][\w:]*::)?VERSION\s*=\s*(?:'([^']+)'|"([^"]+)")"#,
51    )
52    .unwrap()
53});
54
55const PACKAGE_TYPE: PackageType = PackageType::Cpan;
56const MAX_METADATA_FILE_SIZE: u64 = 1024 * 1024;
57
58pub struct CpanMakefilePlParser;
59
60impl PackageParser for CpanMakefilePlParser {
61    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
62
63    fn is_match(path: &Path) -> bool {
64        path.file_name().is_some_and(|name| name == "Makefile.PL")
65    }
66
67    fn extract_packages(path: &Path) -> Vec<PackageData> {
68        let content = match read_file_to_string(path, None) {
69            Ok(c) => c,
70            Err(e) => {
71                warn!("Failed to read Makefile.PL file {:?}: {}", path, e);
72                return vec![PackageData {
73                    package_type: Some(PACKAGE_TYPE),
74                    primary_language: Some("Perl".to_string()),
75                    datasource_id: Some(DatasourceId::CpanMakefile),
76                    ..Default::default()
77                }];
78            }
79        };
80
81        vec![parse_makefile_pl_with_base(&content, path.parent())]
82    }
83}
84
/// Test-only entry point: parses `content` with no base directory, so
/// `VERSION_FROM` / `ABSTRACT_FROM` references are never followed on disk.
#[cfg(test)]
pub(crate) fn parse_makefile_pl(content: &str) -> PackageData {
    let no_base_dir: Option<&Path> = None;
    parse_makefile_pl_with_base(content, no_base_dir)
}
89
90pub(crate) fn parse_makefile_pl_with_base(content: &str, base_dir: Option<&Path>) -> PackageData {
91    // Find WriteMakefile or WriteMakefile1 call
92    let makefile_block = extract_writemakefile_block(content);
93    if makefile_block.is_empty() {
94        return default_package_data();
95    }
96
97    let fields = parse_hash_fields(&makefile_block);
98
99    let name = fields.get("NAME").map(|n| truncate_field(n.to_string()));
100    let resolved_metadata = resolve_referenced_metadata(&fields, base_dir);
101
102    let version = fields
103        .get("VERSION")
104        .map(|v| truncate_field(v.to_string()))
105        .or_else(|| resolved_metadata.version.clone());
106    let description = fields
107        .get("ABSTRACT")
108        .map(|d| truncate_field(d.to_string()))
109        .or_else(|| resolved_metadata.abstract_text.clone());
110    let extracted_license_statement = fields.get("LICENSE").map(|l| truncate_field(l.to_string()));
111    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
112        extracted_license_statement
113            .as_deref()
114            .and_then(normalize_cpan_makefile_license)
115            .map(|normalized| {
116                build_declared_license_data(
117                    normalized,
118                    DeclaredLicenseMatchMetadata::single_line(
119                        extracted_license_statement.as_deref().unwrap_or_default(),
120                    ),
121                )
122            })
123            .unwrap_or_else(empty_declared_license_data);
124
125    let parties = parse_author(&fields);
126    let dependencies = parse_dependencies(&fields);
127
128    let mut extra_data = HashMap::new();
129    if let Some(min_perl) = fields.get("MIN_PERL_VERSION") {
130        extra_data.insert(
131            "MIN_PERL_VERSION".to_string(),
132            json!(truncate_field(min_perl.to_string())),
133        );
134    }
135    if let Some(version_from) = fields.get("VERSION_FROM") {
136        extra_data.insert(
137            "VERSION_FROM".to_string(),
138            json!(truncate_field(version_from.to_string())),
139        );
140    }
141    if let Some(abstract_from) = fields.get("ABSTRACT_FROM") {
142        extra_data.insert(
143            "ABSTRACT_FROM".to_string(),
144            json!(truncate_field(abstract_from.to_string())),
145        );
146    }
147
148    // Build PURL: convert Foo::Bar to Foo-Bar for CPAN naming convention
149    let purl = name.as_ref().and_then(|n| {
150        let purl_name = n.replace("::", "-");
151        PackageUrl::new("cpan", &purl_name).ok().map(|mut p| {
152            if let Some(v) = &version {
153                let _ = p.with_version(v).ok();
154            }
155            p.to_string()
156        })
157    });
158
159    PackageData {
160        package_type: Some(PACKAGE_TYPE),
161        namespace: Some("cpan".to_string()),
162        name,
163        version,
164        description,
165        declared_license_expression,
166        declared_license_expression_spdx,
167        license_detections,
168        extracted_license_statement,
169        parties,
170        dependencies,
171        extra_data: if extra_data.is_empty() {
172            None
173        } else {
174            Some(extra_data)
175        },
176        purl,
177        datasource_id: Some(DatasourceId::CpanMakefile),
178        primary_language: Some("Perl".to_string()),
179        ..Default::default()
180    }
181}
182
/// Values recovered from the files named by `VERSION_FROM` / `ABSTRACT_FROM`.
///
/// Both fields default to `None`, meaning nothing could be resolved.
#[derive(Default)]
struct ResolvedMetadata {
    /// Version string taken from the file referenced by `VERSION_FROM`.
    version: Option<String>,
    /// Abstract text taken from the file referenced by `ABSTRACT_FROM`.
    abstract_text: Option<String>,
}
188
189fn default_package_data() -> PackageData {
190    PackageData {
191        package_type: Some(PACKAGE_TYPE),
192        primary_language: Some("Perl".to_string()),
193        datasource_id: Some(DatasourceId::CpanMakefile),
194        ..Default::default()
195    }
196}
197
198fn normalize_cpan_makefile_license(value: &str) -> Option<NormalizedDeclaredLicense> {
199    match value.trim() {
200        "perl_5" | "Perl_5" => Some(NormalizedDeclaredLicense::new(
201            "gpl-1.0-plus OR artistic-perl-1.0",
202            "GPL-1.0-or-later OR Artistic-1.0-Perl",
203        )),
204        "artistic_2" => Some(NormalizedDeclaredLicense::new(
205            "artistic-2.0",
206            "Artistic-2.0",
207        )),
208        "apache_2_0" => Some(NormalizedDeclaredLicense::new("apache-2.0", "Apache-2.0")),
209        other => normalize_spdx_expression(other).or_else(|| normalize_declared_license_key(other)),
210    }
211}
212
213fn resolve_referenced_metadata(
214    fields: &HashMap<String, String>,
215    base_dir: Option<&Path>,
216) -> ResolvedMetadata {
217    let Some(base_dir) = base_dir else {
218        return ResolvedMetadata::default();
219    };
220
221    let mut resolved = ResolvedMetadata::default();
222    let mut cache: HashMap<String, Option<String>> = HashMap::new();
223
224    if let Some(version_from) = fields.get("VERSION_FROM")
225        && let Some(content) = load_referenced_metadata_file(base_dir, version_from, &mut cache)
226    {
227        resolved.version = extract_version_from_module_content(content);
228    }
229
230    if let Some(abstract_from) = fields.get("ABSTRACT_FROM")
231        && let Some(content) = load_referenced_metadata_file(base_dir, abstract_from, &mut cache)
232    {
233        resolved.abstract_text = extract_abstract_from_module_content(content);
234    }
235
236    resolved
237}
238
239fn load_referenced_metadata_file<'a>(
240    base_dir: &Path,
241    relative_path: &str,
242    cache: &'a mut HashMap<String, Option<String>>,
243) -> Option<&'a String> {
244    let entry = cache
245        .entry(relative_path.to_string())
246        .or_insert_with(|| read_safe_metadata_file(base_dir, relative_path));
247    entry.as_ref()
248}
249
250fn read_safe_metadata_file(base_dir: &Path, relative_path: &str) -> Option<String> {
251    let ref_path = Path::new(relative_path);
252    if ref_path.is_absolute() {
253        return None;
254    }
255
256    let base_dir = base_dir.canonicalize().ok()?;
257    let candidate = base_dir.join(ref_path);
258    let canonical_candidate = candidate.canonicalize().ok()?;
259    if !canonical_candidate.starts_with(&base_dir) {
260        return None;
261    }
262
263    let metadata = std::fs::metadata(&canonical_candidate).ok()?;
264    if !metadata.is_file() || metadata.len() > MAX_METADATA_FILE_SIZE {
265        return None;
266    }
267
268    read_file_to_string(&canonical_candidate, None).ok()
269}
270
271fn extract_version_from_module_content(content: &str) -> Option<String> {
272    RE_VERSION_ASSIGNMENT
273        .captures(content)
274        .and_then(|caps| caps.get(1).or_else(|| caps.get(2)))
275        .map(|m| m.as_str().trim().to_string())
276        .map(truncate_field)
277        .filter(|value| !value.is_empty())
278}
279
280fn extract_abstract_from_module_content(content: &str) -> Option<String> {
281    let mut in_name_section = false;
282
283    for line in content.lines() {
284        let trimmed = line.trim();
285        if trimmed == "=head1 NAME" {
286            in_name_section = true;
287            continue;
288        }
289
290        if in_name_section {
291            if trimmed.starts_with('=') {
292                break;
293            }
294            if trimmed.is_empty() {
295                continue;
296            }
297
298            if let Some((_, abstract_text)) = trimmed.split_once(" - ") {
299                let abstract_text = abstract_text.trim();
300                if !abstract_text.is_empty() {
301                    return Some(truncate_field(abstract_text.to_string()));
302                }
303            }
304        }
305    }
306
307    None
308}
309
310fn extract_writemakefile_block(content: &str) -> String {
311    let start_match = match RE_WRITEMAKEFILE.find(content) {
312        Some(m) => m,
313        None => return String::new(),
314    };
315
316    let start_pos = start_match.end();
317    let content_from_start = &content[start_pos..];
318
319    // Find the matching closing parenthesis
320    let mut depth = 1;
321    let mut end_pos = 0;
322    let chars: Vec<char> = content_from_start.chars().collect();
323
324    for (i, &ch) in chars.iter().enumerate() {
325        if i >= MAX_ITERATION_COUNT {
326            break;
327        }
328        match ch {
329            '(' => depth += 1,
330            ')' => {
331                depth -= 1;
332                if depth == 0 {
333                    end_pos = i;
334                    break;
335                }
336            }
337            _ => {}
338        }
339    }
340
341    if end_pos > 0 {
342        content_from_start[..end_pos].to_string()
343    } else {
344        String::new()
345    }
346}
347
348fn parse_hash_fields(content: &str) -> HashMap<String, String> {
349    let mut fields = HashMap::new();
350
351    for cap in RE_SIMPLE_KV
352        .captures_iter(content)
353        .take(MAX_ITERATION_COUNT)
354    {
355        let key = cap.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
356        let value = cap
357            .get(2)
358            .or_else(|| cap.get(3))
359            .or_else(|| cap.get(4))
360            .or_else(|| cap.get(5))
361            .map(|m| m.as_str().to_string());
362
363        if let Some(v) = value {
364            fields.insert(key, v);
365        }
366    }
367
368    // Parse hash values (PREREQ_PM, BUILD_REQUIRES, etc.)
369    parse_hash_dependencies(content, &mut fields);
370
371    // Parse array refs for AUTHOR
372    parse_author_array(content, &mut fields);
373
374    fields
375}
376
377fn parse_hash_dependencies(content: &str, fields: &mut HashMap<String, String>) {
378    for cap in RE_HASH_BLOCK
379        .captures_iter(content)
380        .take(MAX_ITERATION_COUNT)
381    {
382        let key = cap.get(1).map(|m| m.as_str()).unwrap_or("");
383        let hash_content = cap.get(2).map(|m| m.as_str()).unwrap_or("");
384
385        // For dependency hashes, we'll store them with a special marker
386        // so parse_dependencies can find them
387        if matches!(
388            key,
389            "PREREQ_PM" | "BUILD_REQUIRES" | "TEST_REQUIRES" | "CONFIGURE_REQUIRES"
390        ) {
391            fields.insert(format!("_HASH_{}", key), hash_content.to_string());
392        }
393    }
394}
395
396fn parse_author_array(content: &str, fields: &mut HashMap<String, String>) {
397    if let Some(cap) = RE_AUTHOR_ARRAY.captures(content) {
398        let array_content = cap.get(1).map(|m| m.as_str()).unwrap_or("");
399
400        let authors: Vec<String> = RE_QUOTED_STRING
401            .captures_iter(array_content)
402            .take(MAX_ITERATION_COUNT)
403            .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
404            .collect();
405
406        if !authors.is_empty() {
407            // Store as JSON array for later processing
408            fields.insert("_ARRAY_AUTHOR".to_string(), authors.join("||"));
409        }
410    }
411}
412
413fn parse_author(fields: &HashMap<String, String>) -> Vec<Party> {
414    // Check for array of authors first
415    if let Some(authors_str) = fields.get("_ARRAY_AUTHOR") {
416        return authors_str
417            .split("||")
418            .filter_map(|author_str| {
419                if author_str.trim().is_empty() {
420                    return None;
421                }
422                let (name, email) = parse_author_string(author_str);
423                Some(Party {
424                    role: Some("author".to_string()),
425                    name,
426                    email,
427                    r#type: Some("person".to_string()),
428                    url: None,
429                    organization: None,
430                    organization_url: None,
431                    timezone: None,
432                })
433            })
434            .collect();
435    }
436
437    if let Some(author_str) = fields.get("AUTHOR") {
438        let (name, email) = parse_author_string(author_str);
439        return vec![Party {
440            role: Some("author".to_string()),
441            name,
442            email,
443            r#type: Some("person".to_string()),
444            url: None,
445            organization: None,
446            organization_url: None,
447            timezone: None,
448        }];
449    }
450
451    Vec::new()
452}
453
454fn parse_author_string(s: &str) -> (Option<String>, Option<String>) {
455    if let Some(start) = s.find('<')
456        && let Some(end) = s.find('>')
457        && start < end
458    {
459        let name = s[..start].trim();
460        let email = s[start + 1..end].trim();
461        return (
462            if name.is_empty() {
463                None
464            } else {
465                Some(truncate_field(name.to_string()))
466            },
467            if email.is_empty() {
468                None
469            } else {
470                Some(truncate_field(email.to_string()))
471            },
472        );
473    }
474    (Some(truncate_field(s.trim().to_string())), None)
475}
476
477fn parse_dependencies(fields: &HashMap<String, String>) -> Vec<Dependency> {
478    let mut dependencies = Vec::new();
479
480    // Parse PREREQ_PM as runtime dependencies
481    if let Some(hash_content) = fields.get("_HASH_PREREQ_PM") {
482        dependencies.extend(extract_deps_from_hash(hash_content, "runtime", true));
483    }
484
485    // Parse BUILD_REQUIRES
486    if let Some(hash_content) = fields.get("_HASH_BUILD_REQUIRES") {
487        dependencies.extend(extract_deps_from_hash(hash_content, "build", false));
488    }
489
490    // Parse TEST_REQUIRES
491    if let Some(hash_content) = fields.get("_HASH_TEST_REQUIRES") {
492        dependencies.extend(extract_deps_from_hash(hash_content, "test", false));
493    }
494
495    // Parse CONFIGURE_REQUIRES
496    if let Some(hash_content) = fields.get("_HASH_CONFIGURE_REQUIRES") {
497        dependencies.extend(extract_deps_from_hash(hash_content, "configure", false));
498    }
499
500    dependencies
501}
502
503fn extract_deps_from_hash(hash_content: &str, scope: &str, is_runtime: bool) -> Vec<Dependency> {
504    let mut deps = Vec::new();
505
506    for cap in RE_DEP_PAIR
507        .captures_iter(hash_content)
508        .take(MAX_ITERATION_COUNT)
509    {
510        let module_name = cap.get(1).map(|m| m.as_str()).unwrap_or("");
511
512        // Skip perl itself
513        if module_name == "perl" {
514            continue;
515        }
516
517        let version = cap
518            .get(2)
519            .or_else(|| cap.get(3))
520            .or_else(|| cap.get(4))
521            .map(|m| m.as_str());
522
523        let extracted_requirement = match version {
524            Some("0") | Some("") | None => None,
525            Some(v) => Some(truncate_field(v.to_string())),
526        };
527
528        let purl = PackageUrl::new("cpan", module_name)
529            .ok()
530            .map(|p| p.to_string());
531
532        deps.push(Dependency {
533            purl,
534            extracted_requirement,
535            scope: Some(truncate_field(scope.to_string())),
536            is_runtime: Some(is_runtime),
537            is_optional: Some(false),
538            is_pinned: None,
539            is_direct: Some(true),
540            resolved_package: None,
541            extra_data: None,
542        });
543    }
544
545    deps
546}
547
// Registers this parser via the crate's `register_parser!` macro.
// NOTE(review): the arguments look like (description, path patterns, package type,
// primary language, documentation URL) — confirm against the macro definition.
crate::register_parser!(
    "CPAN Perl Makefile.PL",
    &["*/Makefile.PL"],
    "cpan",
    "Perl",
    Some("https://metacpan.org/pod/ExtUtils::MakeMaker"),
);