Skip to main content

provenant/parsers/
cpan_makefile_pl.rs

//! Parser for CPAN Perl `Makefile.PL` files.
//!
//! Extracts Perl package metadata from `Makefile.PL` files used by ExtUtils::MakeMaker.
//!
//! # Supported Formats
//! - `Makefile.PL` - CPAN ExtUtils::MakeMaker build configuration
//!
//! # Implementation Notes
//! - Format: Perl script with `WriteMakefile()` or `WriteMakefile1()` calls
//! - Spec: <https://metacpan.org/pod/ExtUtils::MakeMaker>
//! - Extracts: NAME, VERSION, AUTHOR, LICENSE, ABSTRACT, PREREQ_PM, BUILD_REQUIRES, TEST_REQUIRES, CONFIGURE_REQUIRES
//! - Uses regex-based extraction (no Perl code is executed, for security)
//! - The Python reference has a stub-only handler with no `parse()` method, so this parser goes BEYOND PARITY
14
15use std::collections::HashMap;
16use std::path::Path;
17use std::sync::LazyLock;
18
19use crate::parser_warn as warn;
20use crate::parsers::utils::{MAX_ITERATION_COUNT, read_file_to_string, truncate_field};
21use packageurl::PackageUrl;
22use regex::Regex;
23use serde_json::json;
24
25use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
26
27use super::PackageParser;
28use super::license_normalization::{
29    DeclaredLicenseMatchMetadata, NormalizedDeclaredLicense, build_declared_license_data,
30    empty_declared_license_data, normalize_declared_license_key, normalize_spdx_expression,
31};
32
33static RE_WRITEMAKEFILE: LazyLock<Regex> = LazyLock::new(|| {
34    Regex::new(r"WriteMakefile1?\s*\(").expect("valid regex: WriteMakefile call pattern")
35});
36static RE_SIMPLE_KV: LazyLock<Regex> = LazyLock::new(|| {
37    Regex::new(r#"(?m)^\s*([A-Z_]+)\s*=>\s*(?:'([^']*)'|"([^"]*)"|q\{([^}]*)\}|q\(([^)]*)\))"#)
38        .expect("valid regex: simple key=>value pattern")
39});
40static RE_HASH_BLOCK: LazyLock<Regex> = LazyLock::new(|| {
41    Regex::new(r"([A-Z_]+)\s*=>\s*\{([^}]*)\}").expect("valid regex: hash block pattern")
42});
43static RE_AUTHOR_ARRAY: LazyLock<Regex> = LazyLock::new(|| {
44    Regex::new(r"AUTHOR\s*=>\s*\[([^\]]*)\]").expect("valid regex: AUTHOR array pattern")
45});
46static RE_QUOTED_STRING: LazyLock<Regex> = LazyLock::new(|| {
47    Regex::new(r#"['"]([^'"]*)['"']"#).expect("valid regex: quoted string pattern")
48});
49static RE_DEP_PAIR: LazyLock<Regex> = LazyLock::new(|| {
50    Regex::new(r#"['"]([^'"]+)['"]\s*=>\s*(?:'([^']*)'|"([^"]*)"|(\d+))"#)
51        .expect("valid regex: dependency pair pattern")
52});
53static RE_VERSION_ASSIGNMENT: LazyLock<Regex> = LazyLock::new(|| {
54    Regex::new(
55        r#"(?m)^\s*(?:our\s+)?\$(?:[A-Za-z_][\w:]*::)?VERSION\s*=\s*(?:'([^']+)'|"([^"]+)")"#,
56    )
57    .expect("valid regex: VERSION assignment pattern")
58});
59
60const PACKAGE_TYPE: PackageType = PackageType::Cpan;
61const MAX_METADATA_FILE_SIZE: u64 = 1024 * 1024;
62
63pub struct CpanMakefilePlParser;
64
65impl PackageParser for CpanMakefilePlParser {
66    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
67
68    fn is_match(path: &Path) -> bool {
69        path.file_name().is_some_and(|name| name == "Makefile.PL")
70    }
71
72    fn extract_packages(path: &Path) -> Vec<PackageData> {
73        let content = match read_file_to_string(path, None) {
74            Ok(c) => c,
75            Err(e) => {
76                warn!("Failed to read Makefile.PL file {:?}: {}", path, e);
77                return vec![PackageData {
78                    package_type: Some(PACKAGE_TYPE),
79                    primary_language: Some("Perl".to_string()),
80                    datasource_id: Some(DatasourceId::CpanMakefile),
81                    ..Default::default()
82                }];
83            }
84        };
85
86        vec![parse_makefile_pl_with_base(&content, path.parent())]
87    }
88}
89
/// Test-only convenience wrapper: parses `content` without a base directory, so
/// `VERSION_FROM` / `ABSTRACT_FROM` references are never followed.
#[cfg(test)]
pub(crate) fn parse_makefile_pl(content: &str) -> PackageData {
    let no_base_dir: Option<&Path> = None;
    parse_makefile_pl_with_base(content, no_base_dir)
}
94
95pub(crate) fn parse_makefile_pl_with_base(content: &str, base_dir: Option<&Path>) -> PackageData {
96    // Find WriteMakefile or WriteMakefile1 call
97    let makefile_block = extract_writemakefile_block(content);
98    if makefile_block.is_empty() {
99        return default_package_data();
100    }
101
102    let fields = parse_hash_fields(&makefile_block);
103
104    let name = fields.get("NAME").map(|n| truncate_field(n.to_string()));
105    let resolved_metadata = resolve_referenced_metadata(&fields, base_dir);
106
107    let version = fields
108        .get("VERSION")
109        .map(|v| truncate_field(v.to_string()))
110        .or_else(|| resolved_metadata.version.clone());
111    let description = fields
112        .get("ABSTRACT")
113        .map(|d| truncate_field(d.to_string()))
114        .or_else(|| resolved_metadata.abstract_text.clone());
115    let extracted_license_statement = fields.get("LICENSE").map(|l| truncate_field(l.to_string()));
116    let (declared_license_expression, declared_license_expression_spdx, license_detections) =
117        extracted_license_statement
118            .as_deref()
119            .and_then(normalize_cpan_makefile_license)
120            .map(|normalized| {
121                build_declared_license_data(
122                    normalized,
123                    DeclaredLicenseMatchMetadata::single_line(
124                        extracted_license_statement.as_deref().unwrap_or_default(),
125                    ),
126                )
127            })
128            .unwrap_or_else(empty_declared_license_data);
129
130    let parties = parse_author(&fields);
131    let dependencies = parse_dependencies(&fields);
132
133    let mut extra_data = HashMap::new();
134    if let Some(min_perl) = fields.get("MIN_PERL_VERSION") {
135        extra_data.insert(
136            "MIN_PERL_VERSION".to_string(),
137            json!(truncate_field(min_perl.to_string())),
138        );
139    }
140    if let Some(version_from) = fields.get("VERSION_FROM") {
141        extra_data.insert(
142            "VERSION_FROM".to_string(),
143            json!(truncate_field(version_from.to_string())),
144        );
145    }
146    if let Some(abstract_from) = fields.get("ABSTRACT_FROM") {
147        extra_data.insert(
148            "ABSTRACT_FROM".to_string(),
149            json!(truncate_field(abstract_from.to_string())),
150        );
151    }
152
153    // Build PURL: convert Foo::Bar to Foo-Bar for CPAN naming convention
154    let purl = name.as_ref().and_then(|n| {
155        let purl_name = n.replace("::", "-");
156        PackageUrl::new("cpan", &purl_name).ok().map(|mut p| {
157            if let Some(v) = &version {
158                let _ = p.with_version(v).ok();
159            }
160            p.to_string()
161        })
162    });
163
164    PackageData {
165        package_type: Some(PACKAGE_TYPE),
166        namespace: Some("cpan".to_string()),
167        name,
168        version,
169        description,
170        declared_license_expression,
171        declared_license_expression_spdx,
172        license_detections,
173        extracted_license_statement,
174        parties,
175        dependencies,
176        extra_data: if extra_data.is_empty() {
177            None
178        } else {
179            Some(extra_data)
180        },
181        purl,
182        datasource_id: Some(DatasourceId::CpanMakefile),
183        primary_language: Some("Perl".to_string()),
184        ..Default::default()
185    }
186}
187
/// Metadata read from the files named by `VERSION_FROM` / `ABSTRACT_FROM`.
///
/// Both fields are `None` when the corresponding reference is absent or could not be
/// resolved.
#[derive(Debug, Default)]
struct ResolvedMetadata {
    /// Version found in the `VERSION_FROM` file.
    version: Option<String>,
    /// Abstract found in the `ABSTRACT_FROM` file.
    abstract_text: Option<String>,
}
193
194fn default_package_data() -> PackageData {
195    PackageData {
196        package_type: Some(PACKAGE_TYPE),
197        primary_language: Some("Perl".to_string()),
198        datasource_id: Some(DatasourceId::CpanMakefile),
199        ..Default::default()
200    }
201}
202
203fn normalize_cpan_makefile_license(value: &str) -> Option<NormalizedDeclaredLicense> {
204    match value.trim() {
205        "perl_5" | "Perl_5" => Some(NormalizedDeclaredLicense::new(
206            "gpl-1.0-plus OR artistic-perl-1.0",
207            "GPL-1.0-or-later OR Artistic-1.0-Perl",
208        )),
209        "artistic_2" => Some(NormalizedDeclaredLicense::new(
210            "artistic-2.0",
211            "Artistic-2.0",
212        )),
213        "apache_2_0" => Some(NormalizedDeclaredLicense::new("apache-2.0", "Apache-2.0")),
214        other => normalize_spdx_expression(other).or_else(|| normalize_declared_license_key(other)),
215    }
216}
217
218fn resolve_referenced_metadata(
219    fields: &HashMap<String, String>,
220    base_dir: Option<&Path>,
221) -> ResolvedMetadata {
222    let Some(base_dir) = base_dir else {
223        return ResolvedMetadata::default();
224    };
225
226    let mut resolved = ResolvedMetadata::default();
227    let mut cache: HashMap<String, Option<String>> = HashMap::new();
228
229    if let Some(version_from) = fields.get("VERSION_FROM")
230        && let Some(content) = load_referenced_metadata_file(base_dir, version_from, &mut cache)
231    {
232        resolved.version = extract_version_from_module_content(content);
233    }
234
235    if let Some(abstract_from) = fields.get("ABSTRACT_FROM")
236        && let Some(content) = load_referenced_metadata_file(base_dir, abstract_from, &mut cache)
237    {
238        resolved.abstract_text = extract_abstract_from_module_content(content);
239    }
240
241    resolved
242}
243
244fn load_referenced_metadata_file<'a>(
245    base_dir: &Path,
246    relative_path: &str,
247    cache: &'a mut HashMap<String, Option<String>>,
248) -> Option<&'a String> {
249    let entry = cache
250        .entry(relative_path.to_string())
251        .or_insert_with(|| read_safe_metadata_file(base_dir, relative_path));
252    entry.as_ref()
253}
254
255fn read_safe_metadata_file(base_dir: &Path, relative_path: &str) -> Option<String> {
256    let ref_path = Path::new(relative_path);
257    if ref_path.is_absolute() {
258        return None;
259    }
260
261    let base_dir = base_dir.canonicalize().ok()?;
262    let candidate = base_dir.join(ref_path);
263    let canonical_candidate = candidate.canonicalize().ok()?;
264    if !canonical_candidate.starts_with(&base_dir) {
265        return None;
266    }
267
268    let metadata = std::fs::metadata(&canonical_candidate).ok()?;
269    if !metadata.is_file() || metadata.len() > MAX_METADATA_FILE_SIZE {
270        return None;
271    }
272
273    read_file_to_string(&canonical_candidate, None).ok()
274}
275
276fn extract_version_from_module_content(content: &str) -> Option<String> {
277    RE_VERSION_ASSIGNMENT
278        .captures(content)
279        .and_then(|caps| caps.get(1).or_else(|| caps.get(2)))
280        .map(|m| m.as_str().trim().to_string())
281        .map(truncate_field)
282        .filter(|value| !value.is_empty())
283}
284
285fn extract_abstract_from_module_content(content: &str) -> Option<String> {
286    let mut in_name_section = false;
287
288    for line in content.lines() {
289        let trimmed = line.trim();
290        if trimmed == "=head1 NAME" {
291            in_name_section = true;
292            continue;
293        }
294
295        if in_name_section {
296            if trimmed.starts_with('=') {
297                break;
298            }
299            if trimmed.is_empty() {
300                continue;
301            }
302
303            if let Some((_, abstract_text)) = trimmed.split_once(" - ") {
304                let abstract_text = abstract_text.trim();
305                if !abstract_text.is_empty() {
306                    return Some(truncate_field(abstract_text.to_string()));
307                }
308            }
309        }
310    }
311
312    None
313}
314
315fn extract_writemakefile_block(content: &str) -> String {
316    let start_match = match RE_WRITEMAKEFILE.find(content) {
317        Some(m) => m,
318        None => return String::new(),
319    };
320
321    let start_pos = start_match.end();
322    let content_from_start = &content[start_pos..];
323
324    // Find the matching closing parenthesis
325    let mut depth = 1;
326    let mut end_pos = 0;
327    let chars: Vec<char> = content_from_start.chars().collect();
328
329    for (i, &ch) in chars.iter().enumerate() {
330        if i >= MAX_ITERATION_COUNT {
331            break;
332        }
333        match ch {
334            '(' => depth += 1,
335            ')' => {
336                depth -= 1;
337                if depth == 0 {
338                    end_pos = i;
339                    break;
340                }
341            }
342            _ => {}
343        }
344    }
345
346    if end_pos > 0 {
347        content_from_start[..end_pos].to_string()
348    } else {
349        String::new()
350    }
351}
352
353fn parse_hash_fields(content: &str) -> HashMap<String, String> {
354    let mut fields = HashMap::new();
355
356    for cap in RE_SIMPLE_KV
357        .captures_iter(content)
358        .take(MAX_ITERATION_COUNT)
359    {
360        let key = cap.get(1).map(|m| m.as_str()).unwrap_or("").to_string();
361        let value = cap
362            .get(2)
363            .or_else(|| cap.get(3))
364            .or_else(|| cap.get(4))
365            .or_else(|| cap.get(5))
366            .map(|m| m.as_str().to_string());
367
368        if let Some(v) = value {
369            fields.insert(key, v);
370        }
371    }
372
373    // Parse hash values (PREREQ_PM, BUILD_REQUIRES, etc.)
374    parse_hash_dependencies(content, &mut fields);
375
376    // Parse array refs for AUTHOR
377    parse_author_array(content, &mut fields);
378
379    fields
380}
381
382fn parse_hash_dependencies(content: &str, fields: &mut HashMap<String, String>) {
383    for cap in RE_HASH_BLOCK
384        .captures_iter(content)
385        .take(MAX_ITERATION_COUNT)
386    {
387        let key = cap.get(1).map(|m| m.as_str()).unwrap_or("");
388        let hash_content = cap.get(2).map(|m| m.as_str()).unwrap_or("");
389
390        // For dependency hashes, we'll store them with a special marker
391        // so parse_dependencies can find them
392        if matches!(
393            key,
394            "PREREQ_PM" | "BUILD_REQUIRES" | "TEST_REQUIRES" | "CONFIGURE_REQUIRES"
395        ) {
396            fields.insert(format!("_HASH_{}", key), hash_content.to_string());
397        }
398    }
399}
400
401fn parse_author_array(content: &str, fields: &mut HashMap<String, String>) {
402    if let Some(cap) = RE_AUTHOR_ARRAY.captures(content) {
403        let array_content = cap.get(1).map(|m| m.as_str()).unwrap_or("");
404
405        let authors: Vec<String> = RE_QUOTED_STRING
406            .captures_iter(array_content)
407            .take(MAX_ITERATION_COUNT)
408            .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
409            .collect();
410
411        if !authors.is_empty() {
412            // Store as JSON array for later processing
413            fields.insert("_ARRAY_AUTHOR".to_string(), authors.join("||"));
414        }
415    }
416}
417
418fn parse_author(fields: &HashMap<String, String>) -> Vec<Party> {
419    // Check for array of authors first
420    if let Some(authors_str) = fields.get("_ARRAY_AUTHOR") {
421        return authors_str
422            .split("||")
423            .filter_map(|author_str| {
424                if author_str.trim().is_empty() {
425                    return None;
426                }
427                let (name, email) = parse_author_string(author_str);
428                Some(Party {
429                    role: Some("author".to_string()),
430                    name,
431                    email,
432                    r#type: Some("person".to_string()),
433                    url: None,
434                    organization: None,
435                    organization_url: None,
436                    timezone: None,
437                })
438            })
439            .collect();
440    }
441
442    if let Some(author_str) = fields.get("AUTHOR") {
443        let (name, email) = parse_author_string(author_str);
444        return vec![Party {
445            role: Some("author".to_string()),
446            name,
447            email,
448            r#type: Some("person".to_string()),
449            url: None,
450            organization: None,
451            organization_url: None,
452            timezone: None,
453        }];
454    }
455
456    Vec::new()
457}
458
459fn parse_author_string(s: &str) -> (Option<String>, Option<String>) {
460    if let Some(start) = s.find('<')
461        && let Some(end) = s.find('>')
462        && start < end
463    {
464        let name = s[..start].trim();
465        let email = s[start + 1..end].trim();
466        return (
467            if name.is_empty() {
468                None
469            } else {
470                Some(truncate_field(name.to_string()))
471            },
472            if email.is_empty() {
473                None
474            } else {
475                Some(truncate_field(email.to_string()))
476            },
477        );
478    }
479    (Some(truncate_field(s.trim().to_string())), None)
480}
481
482fn parse_dependencies(fields: &HashMap<String, String>) -> Vec<Dependency> {
483    let mut dependencies = Vec::new();
484
485    // Parse PREREQ_PM as runtime dependencies
486    if let Some(hash_content) = fields.get("_HASH_PREREQ_PM") {
487        dependencies.extend(extract_deps_from_hash(hash_content, "runtime", true));
488    }
489
490    // Parse BUILD_REQUIRES
491    if let Some(hash_content) = fields.get("_HASH_BUILD_REQUIRES") {
492        dependencies.extend(extract_deps_from_hash(hash_content, "build", false));
493    }
494
495    // Parse TEST_REQUIRES
496    if let Some(hash_content) = fields.get("_HASH_TEST_REQUIRES") {
497        dependencies.extend(extract_deps_from_hash(hash_content, "test", false));
498    }
499
500    // Parse CONFIGURE_REQUIRES
501    if let Some(hash_content) = fields.get("_HASH_CONFIGURE_REQUIRES") {
502        dependencies.extend(extract_deps_from_hash(hash_content, "configure", false));
503    }
504
505    dependencies
506}
507
508fn extract_deps_from_hash(hash_content: &str, scope: &str, is_runtime: bool) -> Vec<Dependency> {
509    let mut deps = Vec::new();
510
511    for cap in RE_DEP_PAIR
512        .captures_iter(hash_content)
513        .take(MAX_ITERATION_COUNT)
514    {
515        let module_name = cap.get(1).map(|m| m.as_str()).unwrap_or("");
516
517        // Skip perl itself
518        if module_name == "perl" {
519            continue;
520        }
521
522        let version = cap
523            .get(2)
524            .or_else(|| cap.get(3))
525            .or_else(|| cap.get(4))
526            .map(|m| m.as_str());
527
528        let extracted_requirement = match version {
529            Some("0") | Some("") | None => None,
530            Some(v) => Some(truncate_field(v.to_string())),
531        };
532
533        let purl = PackageUrl::new("cpan", module_name)
534            .ok()
535            .map(|p| p.to_string());
536
537        deps.push(Dependency {
538            purl,
539            extracted_requirement,
540            scope: Some(truncate_field(scope.to_string())),
541            is_runtime: Some(is_runtime),
542            is_optional: Some(false),
543            is_pinned: None,
544            is_direct: Some(true),
545            resolved_package: None,
546            extra_data: None,
547        });
548    }
549
550    deps
551}
552
// Invokes the crate's `register_parser!` macro for this parser.
// NOTE(review): the arguments appear to be (description, path globs, package type,
// primary language, documentation URL) — confirm against the macro definition.
crate::register_parser!(
    "CPAN Perl Makefile.PL",
    &["*/Makefile.PL"],
    "cpan",
    "Perl",
    Some("https://metacpan.org/pod/ExtUtils::MakeMaker"),
);