Skip to main content

provenant/parsers/
cpan_makefile_pl.rs

1//! Parser for CPAN Perl Makefile.PL files.
2//!
3//! Extracts Perl package metadata from `Makefile.PL` files used by ExtUtils::MakeMaker.
4//!
5//! # Supported Formats
6//! - `Makefile.PL` - CPAN ExtUtils::MakeMaker build configuration
7//!
8//! # Implementation Notes
9//! - Format: Perl script with WriteMakefile() or WriteMakefile1() calls
10//! - Spec: https://metacpan.org/pod/ExtUtils::MakeMaker
11//! - Extracts: NAME, VERSION, AUTHOR, LICENSE, ABSTRACT, PREREQ_PM, BUILD_REQUIRES, TEST_REQUIRES, CONFIGURE_REQUIRES
12//! - Uses regex-based extraction (no Perl code execution for security)
13//! - Python reference has stub-only handler with no parse() method - this is BEYOND PARITY
14
15use std::collections::HashMap;
16use std::fs;
17use std::path::Path;
18use std::sync::LazyLock;
19
20use log::warn;
21use packageurl::PackageUrl;
22use regex::Regex;
23use serde_json::json;
24
25use crate::models::{DatasourceId, Dependency, PackageData, PackageType, Party};
26
27use super::PackageParser;
28
29static RE_WRITEMAKEFILE: LazyLock<Regex> =
30    LazyLock::new(|| Regex::new(r"WriteMakefile1?\s*\(").unwrap());
31static RE_SIMPLE_KV: LazyLock<Regex> = LazyLock::new(|| {
32    Regex::new(r#"(?m)^\s*([A-Z_]+)\s*=>\s*(?:'([^']*)'|"([^"]*)"|q\{([^}]*)\}|q\(([^)]*)\))"#)
33        .unwrap()
34});
35static RE_HASH_BLOCK: LazyLock<Regex> =
36    LazyLock::new(|| Regex::new(r"([A-Z_]+)\s*=>\s*\{([^}]*)\}").unwrap());
37static RE_AUTHOR_ARRAY: LazyLock<Regex> =
38    LazyLock::new(|| Regex::new(r"AUTHOR\s*=>\s*\[([^\]]*)\]").unwrap());
39static RE_QUOTED_STRING: LazyLock<Regex> =
40    LazyLock::new(|| Regex::new(r#"['"]([^'"]*)['"']"#).unwrap());
41static RE_DEP_PAIR: LazyLock<Regex> = LazyLock::new(|| {
42    Regex::new(r#"['"]([^'"]+)['"]\s*=>\s*(?:'([^']*)'|"([^"]*)"|(\d+))"#).unwrap()
43});
44
/// Package type stamped on every `PackageData` record this module builds.
const PACKAGE_TYPE: PackageType = PackageType::Cpan;

/// Parser for CPAN `Makefile.PL` files; its behaviour is provided by its `PackageParser` impl.
pub struct CpanMakefilePlParser;
48
49impl PackageParser for CpanMakefilePlParser {
50    const PACKAGE_TYPE: PackageType = PACKAGE_TYPE;
51
52    fn is_match(path: &Path) -> bool {
53        path.file_name().is_some_and(|name| name == "Makefile.PL")
54    }
55
56    fn extract_packages(path: &Path) -> Vec<PackageData> {
57        let content = match fs::read_to_string(path) {
58            Ok(c) => c,
59            Err(e) => {
60                warn!("Failed to read Makefile.PL file {:?}: {}", path, e);
61                return vec![PackageData {
62                    package_type: Some(PACKAGE_TYPE),
63                    primary_language: Some("Perl".to_string()),
64                    datasource_id: Some(DatasourceId::CpanMakefile),
65                    ..Default::default()
66                }];
67            }
68        };
69
70        vec![parse_makefile_pl(&content)]
71    }
72}
73
74pub(crate) fn parse_makefile_pl(content: &str) -> PackageData {
75    // Find WriteMakefile or WriteMakefile1 call
76    let makefile_block = extract_writemakefile_block(content);
77    if makefile_block.is_empty() {
78        return default_package_data();
79    }
80
81    let fields = parse_hash_fields(&makefile_block);
82
83    let name = fields.get("NAME").map(|n| n.to_string());
84    let version = fields.get("VERSION").map(|v| v.to_string());
85    let description = fields.get("ABSTRACT").map(|d| d.to_string());
86    let extracted_license_statement = fields.get("LICENSE").map(|l| l.to_string());
87
88    let parties = parse_author(&fields);
89    let dependencies = parse_dependencies(&fields);
90
91    let mut extra_data = HashMap::new();
92    if let Some(min_perl) = fields.get("MIN_PERL_VERSION") {
93        extra_data.insert("MIN_PERL_VERSION".to_string(), json!(min_perl));
94    }
95    if let Some(version_from) = fields.get("VERSION_FROM") {
96        extra_data.insert("VERSION_FROM".to_string(), json!(version_from));
97    }
98    if let Some(abstract_from) = fields.get("ABSTRACT_FROM") {
99        extra_data.insert("ABSTRACT_FROM".to_string(), json!(abstract_from));
100    }
101
102    // Build PURL: convert Foo::Bar to Foo-Bar for CPAN naming convention
103    let purl = name.as_ref().and_then(|n| {
104        let purl_name = n.replace("::", "-");
105        PackageUrl::new("cpan", &purl_name).ok().map(|mut p| {
106            if let Some(v) = &version {
107                let _ = p.with_version(v).ok();
108            }
109            p.to_string()
110        })
111    });
112
113    PackageData {
114        package_type: Some(PACKAGE_TYPE),
115        namespace: Some("cpan".to_string()),
116        name,
117        version,
118        description,
119        extracted_license_statement,
120        parties,
121        dependencies,
122        extra_data: if extra_data.is_empty() {
123            None
124        } else {
125            Some(extra_data)
126        },
127        purl,
128        datasource_id: Some(DatasourceId::CpanMakefile),
129        primary_language: Some("Perl".to_string()),
130        ..Default::default()
131    }
132}
133
134fn default_package_data() -> PackageData {
135    PackageData {
136        package_type: Some(PACKAGE_TYPE),
137        primary_language: Some("Perl".to_string()),
138        datasource_id: Some(DatasourceId::CpanMakefile),
139        ..Default::default()
140    }
141}
142
143fn extract_writemakefile_block(content: &str) -> String {
144    let start_match = match RE_WRITEMAKEFILE.find(content) {
145        Some(m) => m,
146        None => return String::new(),
147    };
148
149    let start_pos = start_match.end();
150    let content_from_start = &content[start_pos..];
151
152    // Find the matching closing parenthesis
153    let mut depth = 1;
154    let mut end_pos = 0;
155    let chars: Vec<char> = content_from_start.chars().collect();
156
157    for (i, &ch) in chars.iter().enumerate() {
158        match ch {
159            '(' => depth += 1,
160            ')' => {
161                depth -= 1;
162                if depth == 0 {
163                    end_pos = i;
164                    break;
165                }
166            }
167            _ => {}
168        }
169    }
170
171    if end_pos > 0 {
172        content_from_start[..end_pos].to_string()
173    } else {
174        String::new()
175    }
176}
177
178fn parse_hash_fields(content: &str) -> HashMap<String, String> {
179    let mut fields = HashMap::new();
180
181    for cap in RE_SIMPLE_KV.captures_iter(content) {
182        let key = cap
183            .get(1)
184            .expect("group 1 always exists")
185            .as_str()
186            .to_string();
187        let value = cap
188            .get(2)
189            .or_else(|| cap.get(3))
190            .or_else(|| cap.get(4))
191            .or_else(|| cap.get(5))
192            .map(|m| m.as_str().to_string());
193
194        if let Some(v) = value {
195            fields.insert(key, v);
196        }
197    }
198
199    // Parse hash values (PREREQ_PM, BUILD_REQUIRES, etc.)
200    parse_hash_dependencies(content, &mut fields);
201
202    // Parse array refs for AUTHOR
203    parse_author_array(content, &mut fields);
204
205    fields
206}
207
208fn parse_hash_dependencies(content: &str, fields: &mut HashMap<String, String>) {
209    for cap in RE_HASH_BLOCK.captures_iter(content) {
210        let key = cap.get(1).expect("group 1 always exists").as_str();
211        let hash_content = cap.get(2).expect("group 2 always exists").as_str();
212
213        // For dependency hashes, we'll store them with a special marker
214        // so parse_dependencies can find them
215        if matches!(
216            key,
217            "PREREQ_PM" | "BUILD_REQUIRES" | "TEST_REQUIRES" | "CONFIGURE_REQUIRES"
218        ) {
219            fields.insert(format!("_HASH_{}", key), hash_content.to_string());
220        }
221    }
222}
223
224fn parse_author_array(content: &str, fields: &mut HashMap<String, String>) {
225    if let Some(cap) = RE_AUTHOR_ARRAY.captures(content) {
226        let array_content = cap.get(1).expect("group 1 always exists").as_str();
227
228        let authors: Vec<String> = RE_QUOTED_STRING
229            .captures_iter(array_content)
230            .filter_map(|c| c.get(1).map(|m| m.as_str().to_string()))
231            .collect();
232
233        if !authors.is_empty() {
234            // Store as JSON array for later processing
235            fields.insert("_ARRAY_AUTHOR".to_string(), authors.join("||"));
236        }
237    }
238}
239
240fn parse_author(fields: &HashMap<String, String>) -> Vec<Party> {
241    // Check for array of authors first
242    if let Some(authors_str) = fields.get("_ARRAY_AUTHOR") {
243        return authors_str
244            .split("||")
245            .filter_map(|author_str| {
246                if author_str.trim().is_empty() {
247                    return None;
248                }
249                let (name, email) = parse_author_string(author_str);
250                Some(Party {
251                    role: Some("author".to_string()),
252                    name,
253                    email,
254                    r#type: Some("person".to_string()),
255                    url: None,
256                    organization: None,
257                    organization_url: None,
258                    timezone: None,
259                })
260            })
261            .collect();
262    }
263
264    // Single author
265    if let Some(author_str) = fields.get("AUTHOR") {
266        let (name, email) = parse_author_string(author_str);
267        return vec![Party {
268            role: Some("author".to_string()),
269            name,
270            email,
271            r#type: Some("person".to_string()),
272            url: None,
273            organization: None,
274            organization_url: None,
275            timezone: None,
276        }];
277    }
278
279    Vec::new()
280}
281
/// Splits an author string of the form `Name <email@example.com>` into `(name, email)`.
///
/// - With a `<...>` pair, the text before `<` is the name and the text inside is the email.
///   The closing `>` is searched for after the `<`, so an earlier stray `>` in the name does
///   not hide the email.
/// - Without a complete `<...>` pair, the whole string is the name and the email is `None`.
///
/// Both parts are trimmed, and a part that is empty after trimming is reported as `None`.
fn parse_author_string(s: &str) -> (Option<String>, Option<String>) {
    /// Trims `text` and returns it only if something is left.
    fn non_blank(text: &str) -> Option<String> {
        let trimmed = text.trim();
        (!trimmed.is_empty()).then(|| trimmed.to_string())
    }

    if let Some(open) = s.find('<') {
        let after_open = &s[open + 1..];
        if let Some(len) = after_open.find('>') {
            return (non_blank(&s[..open]), non_blank(&after_open[..len]));
        }
    }

    // No email found, treat the entire string as the name.
    (non_blank(s), None)
}
306
307fn parse_dependencies(fields: &HashMap<String, String>) -> Vec<Dependency> {
308    let mut dependencies = Vec::new();
309
310    // Parse PREREQ_PM as runtime dependencies
311    if let Some(hash_content) = fields.get("_HASH_PREREQ_PM") {
312        dependencies.extend(extract_deps_from_hash(hash_content, "runtime", true));
313    }
314
315    // Parse BUILD_REQUIRES
316    if let Some(hash_content) = fields.get("_HASH_BUILD_REQUIRES") {
317        dependencies.extend(extract_deps_from_hash(hash_content, "build", false));
318    }
319
320    // Parse TEST_REQUIRES
321    if let Some(hash_content) = fields.get("_HASH_TEST_REQUIRES") {
322        dependencies.extend(extract_deps_from_hash(hash_content, "test", false));
323    }
324
325    // Parse CONFIGURE_REQUIRES
326    if let Some(hash_content) = fields.get("_HASH_CONFIGURE_REQUIRES") {
327        dependencies.extend(extract_deps_from_hash(hash_content, "configure", false));
328    }
329
330    dependencies
331}
332
333fn extract_deps_from_hash(hash_content: &str, scope: &str, is_runtime: bool) -> Vec<Dependency> {
334    let mut deps = Vec::new();
335
336    for cap in RE_DEP_PAIR.captures_iter(hash_content) {
337        let module_name = cap.get(1).expect("group 1 always exists").as_str();
338
339        // Skip perl itself
340        if module_name == "perl" {
341            continue;
342        }
343
344        let version = cap
345            .get(2)
346            .or_else(|| cap.get(3))
347            .or_else(|| cap.get(4))
348            .map(|m| m.as_str());
349
350        let extracted_requirement = match version {
351            Some("0") | Some("") | None => None,
352            Some(v) => Some(v.to_string()),
353        };
354
355        let purl = PackageUrl::new("cpan", module_name)
356            .ok()
357            .map(|p| p.to_string());
358
359        deps.push(Dependency {
360            purl,
361            extracted_requirement,
362            scope: Some(scope.to_string()),
363            is_runtime: Some(is_runtime),
364            is_optional: Some(false),
365            is_pinned: None,
366            is_direct: Some(true),
367            resolved_package: None,
368            extra_data: None,
369        });
370    }
371
372    deps
373}
374
// Registers this parser through the crate-level `register_parser!` macro.
// NOTE(review): the arguments look like (description, path globs, package type, primary
// language, documentation URL) — confirm against the macro definition.
crate::register_parser!(
    "CPAN Perl Makefile.PL",
    &["*/Makefile.PL"],
    "cpan",
    "Perl",
    Some("https://metacpan.org/pod/ExtUtils::MakeMaker"),
);