Skip to main content

provenant/parsers/
cpan_makefile_pl.rs

1//! Parser for CPAN Perl Makefile.PL files.
2//!
3//! Extracts Perl package metadata from `Makefile.PL` files used by ExtUtils::MakeMaker.
4//!
5//! # Supported Formats
6//! - `Makefile.PL` - CPAN ExtUtils::MakeMaker build configuration
7//!
8//! # Implementation Notes
9//! - Format: Perl script with WriteMakefile() or WriteMakefile1() calls
10//! - Spec: https://metacpan.org/pod/ExtUtils::MakeMaker
11//! - Extracts: NAME, VERSION, AUTHOR, LICENSE, ABSTRACT, PREREQ_PM, BUILD_REQUIRES, TEST_REQUIRES, CONFIGURE_REQUIRES
12//! - Uses regex-based extraction (no Perl code execution for security)
13//! - Python reference has stub-only handler with no parse() method - this is BEYOND PARITY
14
15use std::collections::HashMap;
16use std::fs;
17use std::path::Path;
18use std::sync::LazyLock;
19
20use log::warn;
21use packageurl::PackageUrl;
22use regex::Regex;
23use serde_json::json;
24
25use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
26
27use super::PackageParser;
28
/// Matches the start of a `WriteMakefile(` or `WriteMakefile1(` call, up to and
/// including the opening parenthesis.
static RE_WRITEMAKEFILE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"WriteMakefile1?\s*\(").unwrap());
/// Matches a line-leading `KEY => value` pair whose key is upper-case letters and
/// underscores.
///
/// Capture groups: 1 = key, 2 = `'...'` value, 3 = `"..."` value, 4 = `q{...}` value,
/// 5 = `q(...)` value. Exactly one of groups 2-5 participates in a match.
static RE_SIMPLE_KV: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"(?m)^\s*([A-Z_]+)\s*=>\s*(?:'([^']*)'|"([^"]*)"|q\{([^}]*)\}|q\(([^)]*)\))"#)
        .unwrap()
});
/// Matches `KEY => { ... }`. Group 1 is the key and group 2 is everything up to the
/// first `}`, so nested braces are not supported.
static RE_HASH_BLOCK: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"([A-Z_]+)\s*=>\s*\{([^}]*)\}").unwrap());
/// Matches `AUTHOR => [ ... ]`. Group 1 is everything up to the first `]`.
static RE_AUTHOR_ARRAY: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"AUTHOR\s*=>\s*\[([^\]]*)\]").unwrap());
/// Matches a quoted string; group 1 is the text between the quotes.
///
/// The opening and closing quote characters are not required to be the same kind, and
/// the content may not contain either quote character.
static RE_QUOTED_STRING: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"['"]([^'"]*)['"']"#).unwrap());
/// Matches one `'Module::Name' => version` entry inside a dependency hash.
///
/// Capture groups: 1 = module name, 2 = `'...'` version, 3 = `"..."` version,
/// 4 = an unquoted run of digits.
static RE_DEP_PAIR: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r#"['"]([^'"]+)['"]\s*=>\s*(?:'([^']*)'|"([^"]*)"|(\d+))"#).unwrap()
});
/// Matches a line-leading `$VERSION = '...'` assignment, optionally prefixed with `our`
/// and optionally package-qualified (`$Foo::Bar::VERSION`).
///
/// Capture groups: 1 = single-quoted value, 2 = double-quoted value (both non-empty).
static RE_VERSION_ASSIGNMENT: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(
        r#"(?m)^\s*(?:our\s+)?\$(?:[A-Za-z_][\w:]*::)?VERSION\s*=\s*(?:'([^']+)'|"([^"]+)")"#,
    )
    .unwrap()
});
50
/// Package type reported on every `PackageData` produced by this parser.
const PACKAGE_TYPE: PackageType = PackageType::Cpan;
/// Upper bound, in bytes (1 MiB), on the size of a file referenced through
/// `VERSION_FROM` / `ABSTRACT_FROM` that this parser will read.
const MAX_METADATA_FILE_SIZE: u64 = 1024 * 1024;
53
/// Parser for CPAN `Makefile.PL` files; carries no state and is used through its
/// `PackageParser` implementation.
pub struct CpanMakefilePlParser;
55
56impl PackageParser for CpanMakefilePlParser {
57    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
58
59    fn is_match(path: &Path) -> bool {
60        path.file_name().is_some_and(|name| name == "Makefile.PL")
61    }
62
63    fn extract_packages(path: &Path) -> Vec<PackageData> {
64        let content = match fs::read_to_string(path) {
65            Ok(c) => c,
66            Err(e) => {
67                warn!("Failed to read Makefile.PL file {:?}: {}", path, e);
68                return vec![PackageData {
69                    package_type: Some(PACKAGE_TYPE),
70                    primary_language: Some("Perl".to_string()),
71                    datasource_id: Some(DatasourceId::CpanMakefile),
72                    ..Default::default()
73                }];
74            }
75        };
76
77        vec![parse_makefile_pl_with_base(&content, path.parent())]
78    }
79}
80
/// Test-only convenience wrapper: parses `content` without a base directory, so
/// `VERSION_FROM` / `ABSTRACT_FROM` references are never resolved from disk.
#[cfg(test)]
pub(crate) fn parse_makefile_pl(content: &str) -> PackageData {
    parse_makefile_pl_with_base(content, None)
}
85
86pub(crate) fn parse_makefile_pl_with_base(content: &str, base_dir: Option<&Path>) -> PackageData {
87    // Find WriteMakefile or WriteMakefile1 call
88    let makefile_block = extract_writemakefile_block(content);
89    if makefile_block.is_empty() {
90        return default_package_data();
91    }
92
93    let fields = parse_hash_fields(&makefile_block);
94
95    let name = fields.get("NAME").map(|n| n.to_string());
96    let resolved_metadata = resolve_referenced_metadata(&fields, base_dir);
97
98    let version = fields
99        .get("VERSION")
100        .map(|v| v.to_string())
101        .or_else(|| resolved_metadata.version.clone());
102    let description = fields
103        .get("ABSTRACT")
104        .map(|d| d.to_string())
105        .or_else(|| resolved_metadata.abstract_text.clone());
106    let extracted_license_statement = fields.get("LICENSE").map(|l| l.to_string());
107
108    let parties = parse_author(&fields);
109    let dependencies = parse_dependencies(&fields);
110
111    let mut extra_data = HashMap::new();
112    if let Some(min_perl) = fields.get("MIN_PERL_VERSION") {
113        extra_data.insert("MIN_PERL_VERSION".to_string(), json!(min_perl));
114    }
115    if let Some(version_from) = fields.get("VERSION_FROM") {
116        extra_data.insert("VERSION_FROM".to_string(), json!(version_from));
117    }
118    if let Some(abstract_from) = fields.get("ABSTRACT_FROM") {
119        extra_data.insert("ABSTRACT_FROM".to_string(), json!(abstract_from));
120    }
121
122    // Build PURL: convert Foo::Bar to Foo-Bar for CPAN naming convention
123    let purl = name.as_ref().and_then(|n| {
124        let purl_name = n.replace("::", "-");
125        PackageUrl::new("cpan", &purl_name).ok().map(|mut p| {
126            if let Some(v) = &version {
127                let _ = p.with_version(v).ok();
128            }
129            p.to_string()
130        })
131    });
132
133    PackageData {
134        package_type: Some(PACKAGE_TYPE),
135        namespace: Some("cpan".to_string()),
136        name,
137        version,
138        description,
139        extracted_license_statement,
140        parties,
141        dependencies,
142        extra_data: if extra_data.is_empty() {
143            None
144        } else {
145            Some(extra_data)
146        },
147        purl,
148        datasource_id: Some(DatasourceId::CpanMakefile),
149        primary_language: Some("Perl".to_string()),
150        ..Default::default()
151    }
152}
153
/// Metadata recovered from the files named by `VERSION_FROM` / `ABSTRACT_FROM`.
/// Both fields default to `None` when nothing could be resolved.
#[derive(Default)]
struct ResolvedMetadata {
    // Version string taken from the `VERSION_FROM` file, if one was found.
    version: Option<String>,
    // Abstract text taken from the `ABSTRACT_FROM` file, if one was found.
    abstract_text: Option<String>,
}
159
/// Builds the minimal `PackageData` used when nothing could be extracted: only the
/// package type, primary language and datasource id are set; every other field keeps
/// its `Default` value.
fn default_package_data() -> PackageData {
    PackageData {
        package_type: Some(PACKAGE_TYPE),
        primary_language: Some("Perl".to_string()),
        datasource_id: Some(DatasourceId::CpanMakefile),
        ..Default::default()
    }
}
168
169fn resolve_referenced_metadata(
170    fields: &HashMap<String, String>,
171    base_dir: Option<&Path>,
172) -> ResolvedMetadata {
173    let Some(base_dir) = base_dir else {
174        return ResolvedMetadata::default();
175    };
176
177    let mut resolved = ResolvedMetadata::default();
178    let mut cache: HashMap<String, Option<String>> = HashMap::new();
179
180    if let Some(version_from) = fields.get("VERSION_FROM")
181        && let Some(content) = load_referenced_metadata_file(base_dir, version_from, &mut cache)
182    {
183        resolved.version = extract_version_from_module_content(content);
184    }
185
186    if let Some(abstract_from) = fields.get("ABSTRACT_FROM")
187        && let Some(content) = load_referenced_metadata_file(base_dir, abstract_from, &mut cache)
188    {
189        resolved.abstract_text = extract_abstract_from_module_content(content);
190    }
191
192    resolved
193}
194
195fn load_referenced_metadata_file<'a>(
196    base_dir: &Path,
197    relative_path: &str,
198    cache: &'a mut HashMap<String, Option<String>>,
199) -> Option<&'a String> {
200    let entry = cache
201        .entry(relative_path.to_string())
202        .or_insert_with(|| read_safe_metadata_file(base_dir, relative_path));
203    entry.as_ref()
204}
205
206fn read_safe_metadata_file(base_dir: &Path, relative_path: &str) -> Option<String> {
207    let ref_path = Path::new(relative_path);
208    if ref_path.is_absolute() {
209        return None;
210    }
211
212    let base_dir = base_dir.canonicalize().ok()?;
213    let candidate = base_dir.join(ref_path);
214    let canonical_candidate = candidate.canonicalize().ok()?;
215    if !canonical_candidate.starts_with(&base_dir) {
216        return None;
217    }
218
219    let metadata = fs::metadata(&canonical_candidate).ok()?;
220    if !metadata.is_file() || metadata.len() > MAX_METADATA_FILE_SIZE {
221        return None;
222    }
223
224    fs::read_to_string(canonical_candidate).ok()
225}
226
227fn extract_version_from_module_content(content: &str) -> Option<String> {
228    RE_VERSION_ASSIGNMENT
229        .captures(content)
230        .and_then(|caps| caps.get(1).or_else(|| caps.get(2)))
231        .map(|m| m.as_str().trim().to_string())
232        .filter(|value| !value.is_empty())
233}
234
/// Extracts the abstract from the `=head1 NAME` POD section of a module.
///
/// Lines are compared after trimming. Within the section, the first line of the form
/// `<name> - <abstract>` with a non-empty abstract wins; the text after the first
/// `" - "` is returned, trimmed. Scanning stops at the next line starting with `=`.
/// Returns `None` when there is no such section or no such line.
fn extract_abstract_from_module_content(content: &str) -> Option<String> {
    const NAME_HEADING: &str = "=head1 NAME";

    content
        .lines()
        .map(str::trim)
        // Skip everything up to and including the first NAME heading.
        .skip_while(|line| *line != NAME_HEADING)
        .skip(1)
        // Any other POD directive closes the section; a repeated NAME heading does not.
        .take_while(|line| *line == NAME_HEADING || !line.starts_with('='))
        .filter_map(|line| line.split_once(" - "))
        .map(|(_, text)| text.trim())
        .find(|text| !text.is_empty())
        .map(str::to_owned)
}
264
265fn extract_writemakefile_block(content: &str) -> String {
266    let start_match = match RE_WRITEMAKEFILE.find(content) {
267        Some(m) => m,
268        None => return String::new(),
269    };
270
271    let start_pos = start_match.end();
272    let content_from_start = &content[start_pos..];
273
274    // Find the matching closing parenthesis
275    let mut depth = 1;
276    let mut end_pos = 0;
277    let chars: Vec<char> = content_from_start.chars().collect();
278
279    for (i, &ch) in chars.iter().enumerate() {
280        match ch {
281            '(' => depth += 1,
282            ')' => {
283                depth -= 1;
284                if depth == 0 {
285                    end_pos = i;
286                    break;
287                }
288            }
289            _ => {}
290        }
291    }
292
293    if end_pos > 0 {
294        content_from_start[..end_pos].to_string()
295    } else {
296        String::new()
297    }
298}
299
300fn parse_hash_fields(content: &str) -> HashMap<String, String> {
301    let mut fields = HashMap::new();
302
303    for cap in RE_SIMPLE_KV.captures_iter(content) {
304        let key = cap
305            .get(1)
306            .expect("group 1 always exists")
307            .as_str()
308            .to_string();
309        let value = cap
310            .get(2)
311            .or_else(|| cap.get(3))
312            .or_else(|| cap.get(4))
313            .or_else(|| cap.get(5))
314            .map(|m| m.as_str().to_string());
315
316        if let Some(v) = value {
317            fields.insert(key, v);
318        }
319    }
320
321    // Parse hash values (PREREQ_PM, BUILD_REQUIRES, etc.)
322    parse_hash_dependencies(content, &mut fields);
323
324    // Parse array refs for AUTHOR
325    parse_author_array(content, &mut fields);
326
327    fields
328}
329
330fn parse_hash_dependencies(content: &str, fields: &mut HashMap<String, String>) {
331    for cap in RE_HASH_BLOCK.captures_iter(content) {
332        let key = cap.get(1).expect("group 1 always exists").as_str();
333        let hash_content = cap.get(2).expect("group 2 always exists").as_str();
334
335        // For dependency hashes, we'll store them with a special marker
336        // so parse_dependencies can find them
337        if matches!(
338            key,
339            "PREREQ_PM" | "BUILD_REQUIRES" | "TEST_REQUIRES" | "CONFIGURE_REQUIRES"
340        ) {
341            fields.insert(format!("_HASH_{}", key), hash_content.to_string());
342        }
343    }
344}
345
346fn parse_author_array(content: &str, fields: &mut HashMap<String, String>) {
347    if let Some(cap) = RE_AUTHOR_ARRAY.captures(content) {
348        let array_content = cap.get(1).expect("group 1 always exists").as_str();
349
350        let authors: Vec<String> = RE_QUOTED_STRING
351            .captures_iter(array_content)
352            .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
353            .collect();
354
355        if !authors.is_empty() {
356            // Store as JSON array for later processing
357            fields.insert("_ARRAY_AUTHOR".to_string(), authors.join("||"));
358        }
359    }
360}
361
362fn parse_author(fields: &HashMap<String, String>) -> Vec<Party> {
363    // Check for array of authors first
364    if let Some(authors_str) = fields.get("_ARRAY_AUTHOR") {
365        return authors_str
366            .split("||")
367            .filter_map(|author_str| {
368                if author_str.trim().is_empty() {
369                    return None;
370                }
371                let (name, email) = parse_author_string(author_str);
372                Some(Party {
373                    role: Some("author".to_string()),
374                    name,
375                    email,
376                    r#type: Some("person".to_string()),
377                    url: None,
378                    organization: None,
379                    organization_url: None,
380                    timezone: None,
381                })
382            })
383            .collect();
384    }
385
386    // Single author
387    if let Some(author_str) = fields.get("AUTHOR") {
388        let (name, email) = parse_author_string(author_str);
389        return vec![Party {
390            role: Some("author".to_string()),
391            name,
392            email,
393            r#type: Some("person".to_string()),
394            url: None,
395            organization: None,
396            organization_url: None,
397            timezone: None,
398        }];
399    }
400
401    Vec::new()
402}
403
/// Splits an author string of the form `Name <email@example.com>` into
/// `(name, email)`.
///
/// The email is the text between the first `<` and the first `>` that follows it;
/// the name is the text before that `<`. Both parts are trimmed, and an empty part
/// becomes `None`. When there is no `<...>` pair, the whole trimmed string is returned
/// as the name and the email is `None`.
fn parse_author_string(s: &str) -> (Option<String>, Option<String>) {
    /// Trims `text` and maps an empty result to `None`.
    fn non_empty(text: &str) -> Option<String> {
        let text = text.trim();
        (!text.is_empty()).then(|| text.to_string())
    }

    if let Some(start) = s.find('<') {
        let after_open = &s[start + 1..];
        // Look for `>` only after the `<`: a stray `>` earlier in the name
        // (e.g. "R > D Team <team@example.com>") must not hide the email.
        if let Some(len) = after_open.find('>') {
            return (non_empty(&s[..start]), non_empty(&after_open[..len]));
        }
    }

    // No email found, treat entire string as name.
    (Some(s.trim().to_string()), None)
}
428
429fn parse_dependencies(fields: &HashMap<String, String>) -> Vec<Dependency> {
430    let mut dependencies = Vec::new();
431
432    // Parse PREREQ_PM as runtime dependencies
433    if let Some(hash_content) = fields.get("_HASH_PREREQ_PM") {
434        dependencies.extend(extract_deps_from_hash(hash_content, "runtime", true));
435    }
436
437    // Parse BUILD_REQUIRES
438    if let Some(hash_content) = fields.get("_HASH_BUILD_REQUIRES") {
439        dependencies.extend(extract_deps_from_hash(hash_content, "build", false));
440    }
441
442    // Parse TEST_REQUIRES
443    if let Some(hash_content) = fields.get("_HASH_TEST_REQUIRES") {
444        dependencies.extend(extract_deps_from_hash(hash_content, "test", false));
445    }
446
447    // Parse CONFIGURE_REQUIRES
448    if let Some(hash_content) = fields.get("_HASH_CONFIGURE_REQUIRES") {
449        dependencies.extend(extract_deps_from_hash(hash_content, "configure", false));
450    }
451
452    dependencies
453}
454
455fn extract_deps_from_hash(hash_content: &str, scope: &str, is_runtime: bool) -> Vec<Dependency> {
456    let mut deps = Vec::new();
457
458    for cap in RE_DEP_PAIR.captures_iter(hash_content) {
459        let module_name = cap.get(1).expect("group 1 always exists").as_str();
460
461        // Skip perl itself
462        if module_name == "perl" {
463            continue;
464        }
465
466        let version = cap
467            .get(2)
468            .or_else(|| cap.get(3))
469            .or_else(|| cap.get(4))
470            .map(|m| m.as_str());
471
472        let extracted_requirement = match version {
473            Some("0") | Some("") | None => None,
474            Some(v) => Some(v.to_string()),
475        };
476
477        let purl = PackageUrl::new("cpan", module_name)
478            .ok()
479            .map(|p| p.to_string());
480
481        deps.push(Dependency {
482            purl,
483            extracted_requirement,
484            scope: Some(scope.to_string()),
485            is_runtime: Some(is_runtime),
486            is_optional: Some(false),
487            is_pinned: None,
488            is_direct: Some(true),
489            resolved_package: None,
490            extra_data: None,
491        });
492    }
493
494    deps
495}
496
// Registers this parser with the crate-level parser registry.
// NOTE(review): the macro is defined elsewhere; the arguments look like
// (description, path glob patterns, package type, primary language, docs URL) —
// confirm against the `register_parser!` definition.
crate::register_parser!(
    "CPAN Perl Makefile.PL",
    &["*/Makefile.PL"],
    "cpan",
    "Perl",
    Some("https://metacpan.org/pod/ExtUtils::MakeMaker"),
);