Skip to main content

provenant/parsers/
cpan_makefile_pl.rs

1//! Parser for CPAN Perl Makefile.PL files.
2//!
3//! Extracts Perl package metadata from `Makefile.PL` files used by ExtUtils::MakeMaker.
4//!
5//! # Supported Formats
6//! - `Makefile.PL` - CPAN ExtUtils::MakeMaker build configuration
7//!
8//! # Implementation Notes
9//! - Format: Perl script with WriteMakefile() or WriteMakefile1() calls
10//! - Spec: https://metacpan.org/pod/ExtUtils::MakeMaker
11//! - Extracts: NAME, VERSION, AUTHOR, LICENSE, ABSTRACT, PREREQ_PM, BUILD_REQUIRES, TEST_REQUIRES, CONFIGURE_REQUIRES
12//! - Uses regex-based extraction (no Perl code execution for security)
13//! - Python reference has stub-only handler with no parse() method - this is BEYOND PARITY
14
15use std::collections::HashMap;
16use std::fs;
17use std::path::Path;
18use std::sync::LazyLock;
19
20use crate::parser_warn as warn;
21use packageurl::PackageUrl;
22use regex::Regex;
23use serde_json::json;
24
25use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
26
27use super::PackageParser;
28use super::license_normalization::{
29    DeclaredLicenseMatchMetadata, NormalizedDeclaredLicense, build_declared_license_data,
30    empty_declared_license_data, normalize_declared_license_key, normalize_spdx_expression,
31};
32
33static RE_WRITEMAKEFILE: LazyLock<Regex> =
34    LazyLock::new(|| Regex::new(r"WriteMakefile1?\s*\(").unwrap());
35static RE_SIMPLE_KV: LazyLock<Regex> = LazyLock::new(|| {
36    Regex::new(r#"(?m)^\s*([A-Z_]+)\s*=>\s*(?:'([^']*)'|"([^"]*)"|q\{([^}]*)\}|q\(([^)]*)\))"#)
37        .unwrap()
38});
39static RE_HASH_BLOCK: LazyLock<Regex> =
40    LazyLock::new(|| Regex::new(r"([A-Z_]+)\s*=>\s*\{([^}]*)\}").unwrap());
41static RE_AUTHOR_ARRAY: LazyLock<Regex> =
42    LazyLock::new(|| Regex::new(r"AUTHOR\s*=>\s*\[([^\]]*)\]").unwrap());
43static RE_QUOTED_STRING: LazyLock<Regex> =
44    LazyLock::new(|| Regex::new(r#"['"]([^'"]*)['"']"#).unwrap());
45static RE_DEP_PAIR: LazyLock<Regex> = LazyLock::new(|| {
46    Regex::new(r#"['"]([^'"]+)['"]\s*=>\s*(?:'([^']*)'|"([^"]*)"|(\d+))"#).unwrap()
47});
48static RE_VERSION_ASSIGNMENT: LazyLock<Regex> = LazyLock::new(|| {
49    Regex::new(
50        r#"(?m)^\s*(?:our\s+)?\$(?:[A-Za-z_][\w:]*::)?VERSION\s*=\s*(?:'([^']+)'|"([^"]+)")"#,
51    )
52    .unwrap()
53});
54
55const PACKAGE_TYPE: PackageType = PackageType::Cpan;
56const MAX_METADATA_FILE_SIZE: u64 = 1024 * 1024;
57
58pub struct CpanMakefilePlParser;
59
60impl PackageParser for CpanMakefilePlParser {
61    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
62
63    fn is_match(path: &Path) -> bool {
64        path.file_name().is_some_and(|name| name == "Makefile.PL")
65    }
66
67    fn extract_packages(path: &Path) -> Vec<PackageData> {
68        let content = match fs::read_to_string(path) {
69            Ok(c) => c,
70            Err(e) => {
71                warn!("Failed to read Makefile.PL file {:?}: {}", path, e);
72                return vec![PackageData {
73                    package_type: Some(PACKAGE_TYPE),
74                    primary_language: Some("Perl".to_string()),
75                    datasource_id: Some(DatasourceId::CpanMakefile),
76                    ..Default::default()
77                }];
78            }
79        };
80
81        vec![parse_makefile_pl_with_base(&content, path.parent())]
82    }
83}
84
/// Test-only convenience wrapper around [`parse_makefile_pl_with_base`].
///
/// No base directory is supplied, so files named by `VERSION_FROM` / `ABSTRACT_FROM`
/// are never read.
#[cfg(test)]
pub(crate) fn parse_makefile_pl(content: &str) -> PackageData {
    parse_makefile_pl_with_base(content, None)
}
89
90pub(crate) fn parse_makefile_pl_with_base(content: &str, base_dir: Option<&Path>) -> PackageData {
91    // Find WriteMakefile or WriteMakefile1 call
92    let makefile_block = extract_writemakefile_block(content);
93    if makefile_block.is_empty() {
94        return default_package_data();
95    }
96
97    let fields = parse_hash_fields(&makefile_block);
98
99    let name = fields.get("NAME").map(|n| n.to_string());
100    let resolved_metadata = resolve_referenced_metadata(&fields, base_dir);
101
102    let version = fields
103        .get("VERSION")
104        .map(|v| v.to_string())
105        .or_else(|| resolved_metadata.version.clone());
106    let description = fields
107        .get("ABSTRACT")
108        .map(|d| d.to_string())
109        .or_else(|| resolved_metadata.abstract_text.clone());
110    let extracted_license_statement = fields.get("LICENSE").map(|l| l.to_string());
111    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
112        extracted_license_statement
113            .as_deref()
114            .and_then(normalize_cpan_makefile_license)
115            .map(|normalized| {
116                build_declared_license_data(
117                    normalized,
118                    DeclaredLicenseMatchMetadata::single_line(
119                        extracted_license_statement.as_deref().unwrap_or_default(),
120                    ),
121                )
122            })
123            .unwrap_or_else(empty_declared_license_data);
124
125    let parties = parse_author(&fields);
126    let dependencies = parse_dependencies(&fields);
127
128    let mut extra_data = HashMap::new();
129    if let Some(min_perl) = fields.get("MIN_PERL_VERSION") {
130        extra_data.insert("MIN_PERL_VERSION".to_string(), json!(min_perl));
131    }
132    if let Some(version_from) = fields.get("VERSION_FROM") {
133        extra_data.insert("VERSION_FROM".to_string(), json!(version_from));
134    }
135    if let Some(abstract_from) = fields.get("ABSTRACT_FROM") {
136        extra_data.insert("ABSTRACT_FROM".to_string(), json!(abstract_from));
137    }
138
139    // Build PURL: convert Foo::Bar to Foo-Bar for CPAN naming convention
140    let purl = name.as_ref().and_then(|n| {
141        let purl_name = n.replace("::", "-");
142        PackageUrl::new("cpan", &purl_name).ok().map(|mut p| {
143            if let Some(v) = &version {
144                let _ = p.with_version(v).ok();
145            }
146            p.to_string()
147        })
148    });
149
150    PackageData {
151        package_type: Some(PACKAGE_TYPE),
152        namespace: Some("cpan".to_string()),
153        name,
154        version,
155        description,
156        declared_license_expression,
157        declared_license_expression_spdx,
158        license_detections,
159        extracted_license_statement,
160        parties,
161        dependencies,
162        extra_data: if extra_data.is_empty() {
163            None
164        } else {
165            Some(extra_data)
166        },
167        purl,
168        datasource_id: Some(DatasourceId::CpanMakefile),
169        primary_language: Some("Perl".to_string()),
170        ..Default::default()
171    }
172}
173
/// Metadata recovered from the files named by `VERSION_FROM` / `ABSTRACT_FROM`.
///
/// Both fields stay `None` when the reference is absent, unreadable, or yields no match.
#[derive(Default)]
struct ResolvedMetadata {
    // Version string found in the `VERSION_FROM` file, if any.
    version: Option<String>,
    // Abstract text found in the `ABSTRACT_FROM` file, if any.
    abstract_text: Option<String>,
}
179
180fn default_package_data() -> PackageData {
181    PackageData {
182        package_type: Some(PACKAGE_TYPE),
183        primary_language: Some("Perl".to_string()),
184        datasource_id: Some(DatasourceId::CpanMakefile),
185        ..Default::default()
186    }
187}
188
189fn normalize_cpan_makefile_license(value: &str) -> Option<NormalizedDeclaredLicense> {
190    match value.trim() {
191        "perl_5" | "Perl_5" => Some(NormalizedDeclaredLicense::new(
192            "gpl-1.0-plus OR artistic-perl-1.0",
193            "GPL-1.0-or-later OR Artistic-1.0-Perl",
194        )),
195        "artistic_2" => Some(NormalizedDeclaredLicense::new(
196            "artistic-2.0",
197            "Artistic-2.0",
198        )),
199        "apache_2_0" => Some(NormalizedDeclaredLicense::new("apache-2.0", "Apache-2.0")),
200        other => normalize_spdx_expression(other).or_else(|| normalize_declared_license_key(other)),
201    }
202}
203
204fn resolve_referenced_metadata(
205    fields: &HashMap<String, String>,
206    base_dir: Option<&Path>,
207) -> ResolvedMetadata {
208    let Some(base_dir) = base_dir else {
209        return ResolvedMetadata::default();
210    };
211
212    let mut resolved = ResolvedMetadata::default();
213    let mut cache: HashMap<String, Option<String>> = HashMap::new();
214
215    if let Some(version_from) = fields.get("VERSION_FROM")
216        && let Some(content) = load_referenced_metadata_file(base_dir, version_from, &mut cache)
217    {
218        resolved.version = extract_version_from_module_content(content);
219    }
220
221    if let Some(abstract_from) = fields.get("ABSTRACT_FROM")
222        && let Some(content) = load_referenced_metadata_file(base_dir, abstract_from, &mut cache)
223    {
224        resolved.abstract_text = extract_abstract_from_module_content(content);
225    }
226
227    resolved
228}
229
230fn load_referenced_metadata_file<'a>(
231    base_dir: &Path,
232    relative_path: &str,
233    cache: &'a mut HashMap<String, Option<String>>,
234) -> Option<&'a String> {
235    let entry = cache
236        .entry(relative_path.to_string())
237        .or_insert_with(|| read_safe_metadata_file(base_dir, relative_path));
238    entry.as_ref()
239}
240
241fn read_safe_metadata_file(base_dir: &Path, relative_path: &str) -> Option<String> {
242    let ref_path = Path::new(relative_path);
243    if ref_path.is_absolute() {
244        return None;
245    }
246
247    let base_dir = base_dir.canonicalize().ok()?;
248    let candidate = base_dir.join(ref_path);
249    let canonical_candidate = candidate.canonicalize().ok()?;
250    if !canonical_candidate.starts_with(&base_dir) {
251        return None;
252    }
253
254    let metadata = fs::metadata(&canonical_candidate).ok()?;
255    if !metadata.is_file() || metadata.len() > MAX_METADATA_FILE_SIZE {
256        return None;
257    }
258
259    fs::read_to_string(canonical_candidate).ok()
260}
261
262fn extract_version_from_module_content(content: &str) -> Option<String> {
263    RE_VERSION_ASSIGNMENT
264        .captures(content)
265        .and_then(|caps| caps.get(1).or_else(|| caps.get(2)))
266        .map(|m| m.as_str().trim().to_string())
267        .filter(|value| !value.is_empty())
268}
269
/// Extracts the abstract from the `=head1 NAME` section of a module's POD.
///
/// Scans the lines after `=head1 NAME` up to the next line starting with `=`, and
/// returns the text after the first ` - ` separator on the first line where that text
/// is non-blank. Returns `None` when no such line exists.
fn extract_abstract_from_module_content(content: &str) -> Option<String> {
    content
        .lines()
        .map(str::trim)
        .skip_while(|line| *line != "=head1 NAME")
        .skip(1)
        // Any other POD directive ends the section; a repeated NAME heading does not.
        .take_while(|line| !line.starts_with('=') || *line == "=head1 NAME")
        .filter_map(|line| line.split_once(" - "))
        .map(|(_, summary)| summary.trim())
        .find(|summary| !summary.is_empty())
        .map(String::from)
}
299
300fn extract_writemakefile_block(content: &str) -> String {
301    let start_match = match RE_WRITEMAKEFILE.find(content) {
302        Some(m) => m,
303        None => return String::new(),
304    };
305
306    let start_pos = start_match.end();
307    let content_from_start = &content[start_pos..];
308
309    // Find the matching closing parenthesis
310    let mut depth = 1;
311    let mut end_pos = 0;
312    let chars: Vec<char> = content_from_start.chars().collect();
313
314    for (i, &ch) in chars.iter().enumerate() {
315        match ch {
316            '(' => depth += 1,
317            ')' => {
318                depth -= 1;
319                if depth == 0 {
320                    end_pos = i;
321                    break;
322                }
323            }
324            _ => {}
325        }
326    }
327
328    if end_pos > 0 {
329        content_from_start[..end_pos].to_string()
330    } else {
331        String::new()
332    }
333}
334
335fn parse_hash_fields(content: &str) -> HashMap<String, String> {
336    let mut fields = HashMap::new();
337
338    for cap in RE_SIMPLE_KV.captures_iter(content) {
339        let key = cap
340            .get(1)
341            .expect("group 1 always exists")
342            .as_str()
343            .to_string();
344        let value = cap
345            .get(2)
346            .or_else(|| cap.get(3))
347            .or_else(|| cap.get(4))
348            .or_else(|| cap.get(5))
349            .map(|m| m.as_str().to_string());
350
351        if let Some(v) = value {
352            fields.insert(key, v);
353        }
354    }
355
356    // Parse hash values (PREREQ_PM, BUILD_REQUIRES, etc.)
357    parse_hash_dependencies(content, &mut fields);
358
359    // Parse array refs for AUTHOR
360    parse_author_array(content, &mut fields);
361
362    fields
363}
364
365fn parse_hash_dependencies(content: &str, fields: &mut HashMap<String, String>) {
366    for cap in RE_HASH_BLOCK.captures_iter(content) {
367        let key = cap.get(1).expect("group 1 always exists").as_str();
368        let hash_content = cap.get(2).expect("group 2 always exists").as_str();
369
370        // For dependency hashes, we'll store them with a special marker
371        // so parse_dependencies can find them
372        if matches!(
373            key,
374            "PREREQ_PM" | "BUILD_REQUIRES" | "TEST_REQUIRES" | "CONFIGURE_REQUIRES"
375        ) {
376            fields.insert(format!("_HASH_{}", key), hash_content.to_string());
377        }
378    }
379}
380
381fn parse_author_array(content: &str, fields: &mut HashMap<String, String>) {
382    if let Some(cap) = RE_AUTHOR_ARRAY.captures(content) {
383        let array_content = cap.get(1).expect("group 1 always exists").as_str();
384
385        let authors: Vec<String> = RE_QUOTED_STRING
386            .captures_iter(array_content)
387            .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
388            .collect();
389
390        if !authors.is_empty() {
391            // Store as JSON array for later processing
392            fields.insert("_ARRAY_AUTHOR".to_string(), authors.join("||"));
393        }
394    }
395}
396
397fn parse_author(fields: &HashMap<String, String>) -> Vec<Party> {
398    // Check for array of authors first
399    if let Some(authors_str) = fields.get("_ARRAY_AUTHOR") {
400        return authors_str
401            .split("||")
402            .filter_map(|author_str| {
403                if author_str.trim().is_empty() {
404                    return None;
405                }
406                let (name, email) = parse_author_string(author_str);
407                Some(Party {
408                    role: Some("author".to_string()),
409                    name,
410                    email,
411                    r#type: Some("person".to_string()),
412                    url: None,
413                    organization: None,
414                    organization_url: None,
415                    timezone: None,
416                })
417            })
418            .collect();
419    }
420
421    // Single author
422    if let Some(author_str) = fields.get("AUTHOR") {
423        let (name, email) = parse_author_string(author_str);
424        return vec![Party {
425            role: Some("author".to_string()),
426            name,
427            email,
428            r#type: Some("person".to_string()),
429            url: None,
430            organization: None,
431            organization_url: None,
432            timezone: None,
433        }];
434    }
435
436    Vec::new()
437}
438
/// Splits an author string of the form `Name <email@example.com>` into `(name, email)`.
///
/// * With a `<...>` pair, the text before `<` is the name and the text inside the
///   brackets is the email. The closing `>` is searched for *after* the `<`, so a stray
///   `>` earlier in the string does not hide the email.
/// * Without a complete `<...>` pair, the whole string is the name and the email is `None`.
///
/// Either part is `None` when it is empty after trimming, so blank input yields
/// `(None, None)`.
fn parse_author_string(s: &str) -> (Option<String>, Option<String>) {
    /// Trims `text` and returns it only if something is left.
    fn non_empty(text: &str) -> Option<String> {
        let text = text.trim();
        if text.is_empty() {
            None
        } else {
            Some(text.to_string())
        }
    }

    if let Some(open) = s.find('<') {
        let after_open = &s[open + 1..];
        if let Some(close) = after_open.find('>') {
            return (non_empty(&s[..open]), non_empty(&after_open[..close]));
        }
    }

    // No email found, treat entire string as name
    (non_empty(s), None)
}
463
464fn parse_dependencies(fields: &HashMap<String, String>) -> Vec<Dependency> {
465    let mut dependencies = Vec::new();
466
467    // Parse PREREQ_PM as runtime dependencies
468    if let Some(hash_content) = fields.get("_HASH_PREREQ_PM") {
469        dependencies.extend(extract_deps_from_hash(hash_content, "runtime", true));
470    }
471
472    // Parse BUILD_REQUIRES
473    if let Some(hash_content) = fields.get("_HASH_BUILD_REQUIRES") {
474        dependencies.extend(extract_deps_from_hash(hash_content, "build", false));
475    }
476
477    // Parse TEST_REQUIRES
478    if let Some(hash_content) = fields.get("_HASH_TEST_REQUIRES") {
479        dependencies.extend(extract_deps_from_hash(hash_content, "test", false));
480    }
481
482    // Parse CONFIGURE_REQUIRES
483    if let Some(hash_content) = fields.get("_HASH_CONFIGURE_REQUIRES") {
484        dependencies.extend(extract_deps_from_hash(hash_content, "configure", false));
485    }
486
487    dependencies
488}
489
490fn extract_deps_from_hash(hash_content: &str, scope: &str, is_runtime: bool) -> Vec<Dependency> {
491    let mut deps = Vec::new();
492
493    for cap in RE_DEP_PAIR.captures_iter(hash_content) {
494        let module_name = cap.get(1).expect("group 1 always exists").as_str();
495
496        // Skip perl itself
497        if module_name == "perl" {
498            continue;
499        }
500
501        let version = cap
502            .get(2)
503            .or_else(|| cap.get(3))
504            .or_else(|| cap.get(4))
505            .map(|m| m.as_str());
506
507        let extracted_requirement = match version {
508            Some("0") | Some("") | None => None,
509            Some(v) => Some(v.to_string()),
510        };
511
512        let purl = PackageUrl::new("cpan", module_name)
513            .ok()
514            .map(|p| p.to_string());
515
516        deps.push(Dependency {
517            purl,
518            extracted_requirement,
519            scope: Some(scope.to_string()),
520            is_runtime: Some(is_runtime),
521            is_optional: Some(false),
522            is_pinned: None,
523            is_direct: Some(true),
524            resolved_package: None,
525            extra_data: None,
526        });
527    }
528
529    deps
530}
531
// Invokes the crate's `register_parser!` macro for this parser.
// NOTE(review): the arguments look like a display description, the path glob(s) to
// match, the package type, the primary language and a documentation URL — confirm
// against the macro definition.
crate::register_parser!(
    "CPAN Perl Makefile.PL",
    &["*/Makefile.PL"],
    "cpan",
    "Perl",
    Some("https://metacpan.org/pod/ExtUtils::MakeMaker"),
);