perl-parser 0.13.1

Native Perl parser (v3) — recursive descent with Tree-sitter-compatible AST, semantic analysis, and LSP provider engine
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
//! Subroutine inlining for Perl code.
//!
//! Provides text-based subroutine inlining that replaces a call site with
//! the function's body after substituting formal parameters with the actual
//! arguments from the call.
//!
//! # Limitations
//!
//! This is a text-pattern implementation. It does not build a full AST and
//! therefore relies on heuristics for:
//! - Parameter extraction (assumes `my ($a, $b, …) = @_;` pattern)
//! - Return detection (counts `return` keywords)
//! - Recursion detection (looks for the sub name inside the body)
//! - Side-effect detection (looks for `print`, `printf`, `say`, `warn`, `die`,
//!   `open`, `close`, `write`, `read`, `seek`, `sysread`, `syswrite`)
//!
//! Functions that do not follow these conventions may not be inlined correctly.
//! The safe defaults are to **reject** when uncertain (recursion, large body,
//! multiple returns) and to **warn** when side effects are detected.

use std::collections::HashMap;

/// Maximum number of body lines a subroutine may have and still be inlined.
///
/// The limit is inclusive: a body with exactly this many lines is accepted,
/// and anything longer is rejected with [`InlineError::TooLarge`].
const MAX_BODY_LINES: usize = 50;

// ---------------------------------------------------------------------------
// Error type
// ---------------------------------------------------------------------------

/// Error type returned by subroutine inlining operations.
///
/// Every variant carries enough context to render a user-facing diagnostic
/// through its `Display` implementation. The type is comparable
/// (`PartialEq`/`Eq`) so callers and tests can match on exact error values.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum InlineError {
    /// The target subroutine was not found in the provided source.
    SubNotFound {
        /// Name of the subroutine that was searched for.
        name: String,
    },
    /// The subroutine calls itself (direct recursion) and cannot be inlined.
    Recursive {
        /// Name of the recursive subroutine.
        name: String,
    },
    /// The subroutine body has too many lines to inline safely.
    TooLarge {
        /// Name of the subroutine.
        name: String,
        /// Actual line count of the body.
        line_count: usize,
    },
    /// The subroutine has more than one `return` statement, which requires
    /// control-flow restructuring beyond simple text substitution.
    MultipleReturns {
        /// Name of the subroutine.
        name: String,
        /// Number of `return` statements found.
        count: usize,
    },
    /// The call site expression could not be parsed (for example the
    /// subroutine name is missing or a parenthesis is unmatched).
    CallSiteParseFailed {
        /// Diagnostic message.
        message: String,
    },
}

impl std::fmt::Display for InlineError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            InlineError::SubNotFound { name } => {
                write!(f, "subroutine '{}' not found in source", name)
            }
            InlineError::Recursive { name } => {
                write!(f, "cannot inline recursive subroutine '{}'", name)
            }
            InlineError::TooLarge { name, line_count } => {
                write!(
                    f,
                    "subroutine '{}' is too large to inline ({} lines, max {})",
                    name, line_count, MAX_BODY_LINES
                )
            }
            InlineError::MultipleReturns { name, count } => {
                write!(
                    f,
                    "subroutine '{}' has {} return points; only single-return subs can be inlined",
                    name, count
                )
            }
            InlineError::CallSiteParseFailed { message } => {
                write!(f, "failed to parse call site: {}", message)
            }
        }
    }
}

impl std::error::Error for InlineError {}

// ---------------------------------------------------------------------------
// Analysis result
// ---------------------------------------------------------------------------

/// The result of analysing a subroutine's inlineability.
///
/// There is currently a single variant; rejections are reported through
/// [`InlineError`] instead. The type is comparable (`PartialEq`/`Eq`) so
/// analysis results can be asserted on directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum InlineAbility {
    /// The subroutine can be inlined.
    Ok {
        /// Formal parameter names (without sigils) in declaration order.
        params: Vec<String>,
        /// The body text, stripped of the parameter-extraction line.
        body: String,
        /// Whether the body contains operations with observable side effects.
        has_side_effects: bool,
    },
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

/// Decide whether the subroutine called `sub_name` in `source` may be inlined.
///
/// The checks run in a fixed order and the first failure wins:
/// 1. the definition must be found ([`InlineError::SubNotFound`]),
/// 2. the body must not call itself ([`InlineError::Recursive`]),
/// 3. the body must not exceed `MAX_BODY_LINES` lines ([`InlineError::TooLarge`]),
/// 4. the body must contain at most one `return` ([`InlineError::MultipleReturns`]).
///
/// On success returns `Ok(InlineAbility::Ok { … })` carrying the parameter
/// names, the body without its parameter line, and a side-effect flag.
pub fn analyze_sub_for_inlining(
    source: &str,
    sub_name: &str,
) -> Result<InlineAbility, InlineError> {
    let Some(parsed) = parse_sub_definition(source, sub_name) else {
        return Err(InlineError::SubNotFound { name: sub_name.to_string() });
    };

    // Direct recursion cannot be expressed by textual substitution.
    if body_calls_self(&parsed.body, sub_name) {
        return Err(InlineError::Recursive { name: sub_name.to_string() });
    }

    // Reject bodies that are too long to inline sensibly.
    let line_count = parsed.body.lines().count();
    if line_count > MAX_BODY_LINES {
        return Err(InlineError::TooLarge { name: sub_name.to_string(), line_count });
    }

    // More than one return point would need control-flow restructuring.
    let count = count_return_statements(&parsed.body);
    if count > 1 {
        return Err(InlineError::MultipleReturns { name: sub_name.to_string(), count });
    }

    // Side effects do not block inlining; they are only reported to the caller.
    let effects = has_side_effects(&parsed.body);

    Ok(InlineAbility::Ok { params: parsed.params, body: parsed.body, has_side_effects: effects })
}

/// Text-based Perl subroutine inliner.
///
/// Create one instance per source file and call [`SubInliner::inline_call`]
/// (or one of its variants) to produce the inlined expression text.
pub struct SubInliner {
    source: String,
}

impl SubInliner {
    /// Create a new inliner from Perl source text.
    ///
    /// The source is copied, so the inliner does not borrow from the caller.
    pub fn new(source: &str) -> Self {
        Self { source: source.to_string() }
    }

    /// Inline a single call to `sub_name`.
    ///
    /// `call_expr` is the full call expression string, e.g. `"add(3, 4)"`.
    ///
    /// Returns the replacement text (the inlined expression), or an
    /// [`InlineError`] if the subroutine cannot be inlined. Any warnings
    /// produced along the way are discarded; use
    /// [`SubInliner::inline_call_with_warnings`] to receive them.
    pub fn inline_call(&self, sub_name: &str, call_expr: &str) -> Result<String, InlineError> {
        self.inline_call_inner(sub_name, call_expr, &[]).map(|(inlined, _warnings)| inlined)
    }

    /// Like [`SubInliner::inline_call`] but also returns any warnings
    /// (e.g. side effects) as the second tuple element.
    pub fn inline_call_with_warnings(
        &self,
        sub_name: &str,
        call_expr: &str,
    ) -> Result<(String, Vec<String>), InlineError> {
        self.inline_call_inner(sub_name, call_expr, &[])
    }

    /// Like [`SubInliner::inline_call`] but accepts a list of variable names
    /// that already exist in the outer scope, so collisions can be detected
    /// and renamed.
    pub fn inline_call_with_outer_vars(
        &self,
        sub_name: &str,
        call_expr: &str,
        outer_vars: &[String],
    ) -> Result<String, InlineError> {
        self.inline_call_inner(sub_name, call_expr, outer_vars).map(|(inlined, _warnings)| inlined)
    }

    // ------------------------------------------------------------------
    // Internal
    // ------------------------------------------------------------------

    /// Shared implementation behind the public `inline_call*` entry points.
    ///
    /// Steps: analyse the subroutine, collect warnings, extract the call
    /// arguments, rename colliding locals, substitute parameters, and finally
    /// pull out the return expression.
    ///
    /// A parameter with no corresponding call argument is replaced by
    /// `undef`, mirroring what Perl binds when `@_` is shorter than the
    /// parameter list. Surplus arguments are ignored.
    fn inline_call_inner(
        &self,
        sub_name: &str,
        call_expr: &str,
        outer_vars: &[String],
    ) -> Result<(String, Vec<String>), InlineError> {
        // `InlineAbility` has a single variant, so this pattern is irrefutable.
        let InlineAbility::Ok { params, body, has_side_effects } =
            analyze_sub_for_inlining(&self.source, sub_name)?;

        let mut warnings = Vec::new();
        if has_side_effects {
            warnings.push(format!(
                "subroutine '{}' contains side effects (print/warn/die/I/O); \
                 inlining preserves them but may change semantics",
                sub_name
            ));
        }

        // Extract arguments from the call expression.
        let args = extract_call_args(call_expr, sub_name)?;

        // Build substitution map: param_name -> arg_text. Missing arguments
        // become `undef` rather than an empty string, which would otherwise
        // leave syntactically invalid text such as `(3 + )`.
        let padded_args = args.into_iter().chain(std::iter::repeat_with(|| String::from("undef")));
        let sub_map: HashMap<String, String> = params.into_iter().zip(padded_args).collect();

        // Rename local variables first so that outer variables appearing in
        // the argument text are not touched by the renaming.
        let body = rename_collisions(&body, outer_vars);

        // Substitute parameters in the body.
        let substituted = substitute_params(&body, &sub_map);

        // Extract the return expression from the body.
        let expr = extract_return_expr(&substituted);

        Ok((expr, warnings))
    }
}

// ---------------------------------------------------------------------------
// Parsing helpers
// ---------------------------------------------------------------------------

/// Parsed representation of a subroutine definition extracted from source.
///
/// Produced by `parse_sub_definition`; both fields are owned copies, so the
/// value does not borrow from the source text.
struct ParsedSub {
    /// Formal parameter names (sigils stripped), in declaration order.
    /// Empty when no `my (...) = @_;` line was found in the body.
    params: Vec<String>,
    /// Body text (the content between the outer braces) with the parameter
    /// line removed.
    body: String,
}

/// Locate `sub NAME { … }` in `source` and split it into parameters and body.
///
/// Recognises the pattern:
/// ```text
/// sub NAME {
///     …body…
/// }
/// ```
///
/// Returns `None` when the definition is missing or its braces are unbalanced.
fn parse_sub_definition(source: &str, sub_name: &str) -> Option<ParsedSub> {
    let decl_start = find_sub_start(source, sub_name)?;

    // The body begins right after the first `{` following the declaration.
    let brace_offset = source[decl_start..].find('{')?;
    let inner_start = decl_start + brace_offset + 1;
    let raw_body = extract_balanced_braces(source, inner_start)?;

    // Peel off the `my ($a, $b) = @_;` line, if any.
    let (params, body) = extract_params_line(&raw_body);

    Some(ParsedSub { params, body })
}

/// Find the byte offset of `sub NAME` followed by `{` in `source`.
///
/// A candidate is accepted only when:
/// - the `sub` keyword is not the tail of a longer identifier (so
///   `my_sub add {` or `presub add {` do not count as a definition),
/// - `NAME` is followed by a non-identifier character (so `sub foobar` does
///   not match a search for `foo`), and
/// - the next non-whitespace character after the name is `{`.
///
/// Returns the offset of the `s` in `sub`, or `None` when no definition
/// matches. Prototypes, signatures and attributes between the name and the
/// opening brace are not recognised.
fn find_sub_start(source: &str, sub_name: &str) -> Option<usize> {
    let is_ident = |c: char| c.is_alphanumeric() || c == '_';

    source.match_indices("sub ").map(|(idx, _)| idx).find(|&idx| {
        // Leading word boundary: reject `sub` embedded in a longer word.
        if source[..idx].chars().next_back().is_some_and(is_ident) {
            return false;
        }
        let after_keyword = source[idx + 4..].trim_start();
        let Some(after_name) = after_keyword.strip_prefix(sub_name) else {
            return false;
        };
        // Trailing word boundary on the name, then the opening brace.
        !after_name.chars().next().is_some_and(is_ident)
            && after_name.trim_start().starts_with('{')
    })
}

/// Return the text enclosed by a brace pair whose opening `{` sits just
/// before `open_pos` (i.e. `open_pos` is the byte offset AFTER the `{`).
///
/// Nested braces are tracked by depth; the result stops right before the `}`
/// that closes the outermost pair. Returns `None` if that closing brace is
/// never reached. Braces inside strings or comments are counted like any
/// other brace.
fn extract_balanced_braces(source: &str, open_pos: usize) -> Option<String> {
    let tail = &source[open_pos..];
    // We start inside the already-opened outer brace.
    let mut nesting = 1usize;

    for (offset, ch) in tail.char_indices() {
        match ch {
            '{' => nesting += 1,
            '}' => {
                nesting -= 1;
                if nesting == 0 {
                    return Some(tail[..offset].to_string());
                }
            }
            _ => {}
        }
    }
    None
}

/// Split the parameter-extraction line `my ($a, $b) = @_;` off the body.
///
/// The first line whose trimmed text starts with `my (` and contains `= @_`
/// is treated as the parameter line. It is removed, and the remaining lines
/// are re-joined with `\n`.
///
/// Returns `(params, remaining_body)`; if no such line exists the result is
/// `([], original_body)` with the body left untouched.
fn extract_params_line(body: &str) -> (Vec<String>, String) {
    let lines: Vec<&str> = body.lines().collect();

    let param_idx = lines.iter().position(|line| {
        let text = line.trim();
        text.starts_with("my (") && text.contains("= @_")
    });

    match param_idx {
        Some(idx) => {
            let params = parse_param_names(lines[idx].trim());
            let mut kept = lines;
            kept.remove(idx);
            (params, kept.join("\n"))
        }
        None => (Vec::new(), body.to_string()),
    }
}

/// Extract parameter names from `my ($a, $b) = @_;`, returning bare names
/// without sigils.
///
/// The list is taken from the first `(` up to the first `)` that follows it.
/// Using the first closing parenthesis (rather than the last one on the
/// line) keeps trailing text such as `# first (required)` out of the result.
///
/// Returns an empty vector when the line has no parenthesised list.
fn parse_param_names(line: &str) -> Vec<String> {
    let Some(open) = line.find('(') else {
        return Vec::new();
    };
    let list_and_rest = &line[open + 1..];
    let Some(close) = list_and_rest.find(')') else {
        return Vec::new();
    };

    list_and_rest[..close]
        .split(',')
        .map(|s| s.trim().trim_start_matches(['$', '@', '%']))
        .filter(|s| !s.is_empty())
        .map(str::to_string)
        .collect()
}

// ---------------------------------------------------------------------------
// Body analysis helpers
// ---------------------------------------------------------------------------

/// Count occurrences of the `return` keyword as a standalone token in `body`.
///
/// Skips occurrences inside single- or double-quoted string literals so that
/// `my $msg = "will return a value";` is not counted as a return statement.
///
/// A match must not be part of a longer identifier (`returned`,
/// `my_return`) and must not be a variable name (`$return`, `@return`,
/// `%return`). Comments are not recognised, so a `return` inside a `#`
/// comment is still counted.
fn count_return_statements(body: &str) -> usize {
    const KEYWORD: &str = "return";
    let is_ident = |c: char| c.is_alphanumeric() || c == '_';

    let bytes = body.as_bytes();
    let mut count = 0usize;
    let mut pos = 0;
    let mut in_single_quote = false;
    let mut in_double_quote = false;

    // Invariant: `pos` always sits on a UTF-8 character boundary.
    while pos < body.len() {
        let in_string = in_single_quote || in_double_quote;

        // Track string context — handle backslash escapes.
        match bytes[pos] {
            b'\\' if in_string => {
                // Skip the backslash and the whole escaped character. The
                // escaped character may be multi-byte, so advance by its
                // encoded length instead of a fixed byte count.
                pos += 1;
                pos += body[pos..].chars().next().map_or(0, char::len_utf8);
                continue;
            }
            b'\'' if !in_double_quote => {
                in_single_quote = !in_single_quote;
                pos += 1;
                continue;
            }
            b'"' if !in_single_quote => {
                in_double_quote = !in_double_quote;
                pos += 1;
                continue;
            }
            _ => {}
        }

        // Only count `return` tokens outside string literals.
        if !in_string && body[pos..].starts_with(KEYWORD) {
            let before = body[..pos].chars().next_back();
            let after = body[pos + KEYWORD.len()..].chars().next();
            // A sigil in front means this is a variable named `return`.
            let before_ok = !before.is_some_and(|c| is_ident(c) || matches!(c, '$' | '@' | '%'));
            let after_ok = !after.is_some_and(is_ident);
            if before_ok && after_ok {
                count += 1;
            }
            pos += KEYWORD.len();
            continue;
        }

        pos += body[pos..].chars().next().map_or(1, char::len_utf8);
    }
    count
}

/// Check whether the body contains observable side-effect operations.
///
/// Looks for the I/O and diagnostic builtins listed below as whole words, so
/// both `print "x"` and `print("x")` are detected while unrelated
/// identifiers that merely contain a keyword (`sprintf`, `$thread`) are not.
/// A keyword directly preceded by a sigil (`$print`) is treated as a
/// variable name and ignored.
///
/// This is a textual heuristic: strings and comments are not excluded, so a
/// keyword mentioned there still yields `true`.
fn has_side_effects(body: &str) -> bool {
    const SIDE_EFFECT_KEYWORDS: &[&str] = &[
        "print", "printf", "say", "warn", "die", "open", "close", "read", "write", "seek",
        "sysread", "syswrite",
    ];
    let is_ident = |c: char| c.is_alphanumeric() || c == '_';

    SIDE_EFFECT_KEYWORDS.iter().any(|kw| {
        body.match_indices(kw).any(|(start, _)| {
            let before = body[..start].chars().next_back();
            let after = body[start + kw.len()..].chars().next();
            !before.is_some_and(|c| is_ident(c) || matches!(c, '$' | '@' | '%'))
                && !after.is_some_and(is_ident)
        })
    })
}

/// Check whether the body calls itself (direct recursion).
///
/// A call is `sub_name` as a whole word, optionally followed by whitespace,
/// then `(`. Names that merely end in `sub_name` (`readd(` when looking for
/// `add`) and sigil-prefixed variables (`$add (`) are not treated as calls.
///
/// Skips occurrences that appear inside string literals to avoid
/// false-positive recursion detection when the sub name is merely mentioned
/// in a string (e.g. `my $msg = "add(1,2) adds two numbers"`).
fn body_calls_self(body: &str, sub_name: &str) -> bool {
    let is_ident = |c: char| c.is_alphanumeric() || c == '_';

    let bytes = body.as_bytes();
    let mut pos = 0;
    let mut in_single_quote = false;
    let mut in_double_quote = false;

    // Invariant: `pos` always sits on a UTF-8 character boundary.
    while pos < body.len() {
        let in_string = in_single_quote || in_double_quote;
        match bytes[pos] {
            b'\\' if in_string => {
                // Skip the backslash and the whole (possibly multi-byte)
                // escaped character.
                pos += 1;
                pos += body[pos..].chars().next().map_or(0, char::len_utf8);
                continue;
            }
            b'\'' if !in_double_quote => {
                in_single_quote = !in_single_quote;
                pos += 1;
                continue;
            }
            b'"' if !in_single_quote => {
                in_double_quote = !in_double_quote;
                pos += 1;
                continue;
            }
            _ => {}
        }

        if !in_string {
            if let Some(after_name) = body[pos..].strip_prefix(sub_name) {
                let before = body[..pos].chars().next_back();
                let standalone =
                    !before.is_some_and(|c| is_ident(c) || matches!(c, '$' | '@' | '%'));
                if standalone && after_name.trim_start().starts_with('(') {
                    return true;
                }
            }
        }

        pos += body[pos..].chars().next().map_or(1, char::len_utf8);
    }
    false
}

// ---------------------------------------------------------------------------
// Argument extraction
// ---------------------------------------------------------------------------

/// Extract the argument list from a call expression like `foo(1, 2, "bar")`.
///
/// The subroutine name is located as a whole word that is not a variable
/// (`$foo`, `@foo`, `%foo`), so surrounding text such as
/// `my $address = add(1, 2)` resolves to the real call rather than to the
/// first textual occurrence of the name.
///
/// Returns an empty list for a bare call without parentheses or with empty
/// parentheses.
///
/// # Errors
///
/// Returns [`InlineError::CallSiteParseFailed`] when the name does not occur
/// as a standalone word in `call_expr`, or when the opening parenthesis has
/// no matching close.
fn extract_call_args(call_expr: &str, sub_name: &str) -> Result<Vec<String>, InlineError> {
    let is_ident = |c: char| c.is_alphanumeric() || c == '_';

    let sub_pos = call_expr
        .match_indices(sub_name)
        .map(|(idx, _)| idx)
        .find(|&idx| {
            let before = call_expr[..idx].chars().next_back();
            let after = call_expr[idx + sub_name.len()..].chars().next();
            !before.is_some_and(|c| is_ident(c) || matches!(c, '$' | '@' | '%'))
                && !after.is_some_and(is_ident)
        })
        .ok_or_else(|| InlineError::CallSiteParseFailed {
            message: format!("call expression does not contain sub name '{}'", sub_name),
        })?;

    let after_name_pos = sub_pos + sub_name.len();
    let tail = &call_expr[after_name_pos..];
    let trimmed = tail.trim_start();
    if !trimmed.starts_with('(') {
        // Bare call with no parens — no arguments
        return Ok(Vec::new());
    }

    // Absolute position of '(' = name end + the whitespace that was skipped.
    let open_abs = after_name_pos + (tail.len() - trimmed.len());

    let close_abs = find_matching_paren(call_expr, open_abs).ok_or_else(|| {
        InlineError::CallSiteParseFailed {
            message: "unmatched parenthesis in call expression".to_string(),
        }
    })?;

    let args_str = &call_expr[open_abs + 1..close_abs];
    if args_str.trim().is_empty() {
        return Ok(Vec::new());
    }

    Ok(split_args(args_str))
}

/// Find the matching `)` for the `(` at byte position `open` in `s`.
///
/// Parentheses inside single- or double-quoted string literals are ignored
/// (backslash escapes inside a literal are honoured), so `greet(":)")`
/// resolves to the final `)` rather than the one inside the string.
///
/// Returns `None` if `s[open]` is not `(` or if no matching `)` exists.
fn find_matching_paren(s: &str, open: usize) -> Option<usize> {
    let bytes = s.as_bytes();
    if bytes.get(open) != Some(&b'(') {
        return None;
    }

    let mut depth = 0usize;
    // `Some(q)` while inside a literal delimited by the quote byte `q`.
    let mut quote: Option<u8> = None;
    let mut i = open;

    // Byte-wise scanning is safe here: every byte we compare against is
    // ASCII, and ASCII bytes never occur inside a multi-byte UTF-8 sequence.
    while i < bytes.len() {
        let b = bytes[i];
        match quote {
            Some(q) => {
                if b == b'\\' {
                    // Skip the escaped byte so `\"` does not end the literal.
                    i += 1;
                } else if b == q {
                    quote = None;
                }
            }
            None => match b {
                b'\'' | b'"' => quote = Some(b),
                b'(' => depth += 1,
                b')' => {
                    // `depth` is at least 1 here because scanning starts on `(`.
                    depth -= 1;
                    if depth == 0 {
                        return Some(i);
                    }
                }
                _ => {}
            },
        }
        i += 1;
    }
    None
}

/// Break an argument string into its top-level, comma-separated pieces.
///
/// Commas nested inside `()`, `[]`, `{}` or inside single-/double-quoted
/// strings do not split. Backslash escapes inside a string are copied through
/// verbatim together with the character they escape. Each piece is trimmed;
/// an empty trailing piece is dropped, but empty pieces between commas are
/// kept.
fn split_args(args_str: &str) -> Vec<String> {
    let mut pieces: Vec<String> = Vec::new();
    let mut buf = String::new();
    let mut nesting = 0usize;
    let mut single = false;
    let mut double = false;
    let mut stream = args_str.chars();

    while let Some(ch) = stream.next() {
        let quoted = single || double;

        if ch == '\\' && quoted {
            // Keep the escape and whatever it escapes as one unit.
            buf.push(ch);
            if let Some(escaped) = stream.next() {
                buf.push(escaped);
            }
        } else if ch == '\'' && !double {
            single = !single;
            buf.push(ch);
        } else if ch == '"' && !single {
            double = !double;
            buf.push(ch);
        } else if !quoted && matches!(ch, '(' | '[' | '{') {
            nesting += 1;
            buf.push(ch);
        } else if !quoted && matches!(ch, ')' | ']' | '}') {
            // Tolerate stray closers instead of underflowing.
            nesting = nesting.saturating_sub(1);
            buf.push(ch);
        } else if ch == ',' && nesting == 0 && !quoted {
            pieces.push(buf.trim().to_string());
            buf.clear();
        } else {
            buf.push(ch);
        }
    }

    let last = buf.trim();
    if !last.is_empty() {
        pieces.push(last.to_string());
    }

    pieces
}

// ---------------------------------------------------------------------------
// Body transformation
// ---------------------------------------------------------------------------

/// Replace occurrences of `$param_name` in `body` with the corresponding
/// argument text.
///
/// The body is scanned once. At every `$` the longest identifier that
/// follows is read and looked up in `sub_map`; only an exact name match is
/// replaced, so `$price` never corrupts `$price_adjusted`.
///
/// Because substitution happens in a single pass, argument text that has
/// been inserted is never rescanned. An argument that mentions another
/// parameter's name (e.g. calling `add($b, 1)` for parameters `$a, $b`) is
/// therefore left intact, and the result does not depend on the iteration
/// order of the map.
fn substitute_params(body: &str, sub_map: &HashMap<String, String>) -> String {
    let mut result = String::with_capacity(body.len());
    let mut rest = body;

    while let Some(dollar) = rest.find('$') {
        // Copy everything up to the sigil unchanged.
        result.push_str(&rest[..dollar]);

        let after_sigil = &rest[dollar + 1..];
        let name_len = after_sigil
            .find(|c: char| !(c.is_alphanumeric() || c == '_'))
            .unwrap_or(after_sigil.len());
        let name = &after_sigil[..name_len];

        match sub_map.get(name) {
            Some(arg) if !name.is_empty() => result.push_str(arg),
            _ => {
                // Not a parameter: keep the variable reference verbatim.
                result.push('$');
                result.push_str(name);
            }
        }
        rest = &after_sigil[name_len..];
    }

    result.push_str(rest);
    result
}

/// Give body-local variables that clash with `outer_vars` a fresh name by
/// appending `_inlined` to the bare name.
///
/// For each outer variable (leading `$`, `@` or `%` stripped), the body is
/// only touched when it contains the text `my $NAME`. In that case the
/// declaration and then every whole-word use of `$NAME` are rewritten to
/// `$NAME_inlined`. Outer variables are processed in order, each one working
/// on the result of the previous rename.
fn rename_collisions(body: &str, outer_vars: &[String]) -> String {
    outer_vars.iter().fold(body.to_string(), |text, outer| {
        let bare = outer.trim_start_matches(['$', '@', '%']);
        let declaration = format!("my ${}", bare);
        if !text.contains(&declaration) {
            return text;
        }

        let fresh = format!("{}_inlined", bare);
        // Declaration first; whole-word matching keeps `my $x_count` intact
        // when the outer variable is `$x`.
        let declared = replace_whole_var(&text, &declaration, &format!("my ${}", fresh));
        // Then every remaining use. The already-renamed `$x_inlined` is not
        // matched again because `_` continues the identifier.
        replace_whole_var(&declared, &format!("${}", bare), &format!("${}", fresh))
    })
}

/// Replace occurrences of `var` in `text` that are complete variable
/// references (not a prefix of a longer variable name).
///
/// An occurrence is replaced only when the character following it is not
/// alphanumeric and not `_`. Occurrences that continue into a longer
/// identifier are copied through unchanged.
///
/// An empty `var` matches nothing and returns `text` unchanged; without this
/// guard an empty pattern would match at every position without consuming
/// input, and the scan would never terminate.
fn replace_whole_var(text: &str, var: &str, replacement: &str) -> String {
    if var.is_empty() {
        return text.to_string();
    }

    let mut result = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(idx) = rest.find(var) {
        let after = &rest[idx + var.len()..];
        let continues = after.chars().next().is_some_and(|c| c.is_alphanumeric() || c == '_');
        if continues {
            // Part of a longer name: keep it, and resume one character past
            // the start of this match so overlapping candidates are still seen.
            let step = idx + rest[idx..].chars().next().map_or(1, char::len_utf8);
            result.push_str(&rest[..step]);
            rest = &rest[step..];
        } else {
            result.push_str(&rest[..idx]);
            result.push_str(replacement);
            rest = after;
        }
    }

    result.push_str(rest);
    result
}

/// Extract the expression value from a body containing a single `return`.
///
/// The first line whose trimmed text begins with the `return` keyword (as a
/// whole word) supplies the expression. Its trailing `;` is removed and the
/// result is wrapped in parentheses:
/// - `return expr;` yields `(expr)`,
/// - `return(expr);` yields `((expr))`,
/// - a bare `return;` yields `()`, Perl's empty list.
///
/// If no line starts with `return`, the trimmed body is returned as-is.
/// Statements on other lines are not carried over.
fn extract_return_expr(body: &str) -> String {
    for line in body.lines() {
        let Some(rest) = line.trim().strip_prefix("return") else {
            continue;
        };
        // Word boundary: skip identifiers such as `returned` or `return_value`.
        if rest.chars().next().is_some_and(|c| c.is_alphanumeric() || c == '_') {
            continue;
        }
        let expr = rest.trim().trim_end_matches(';').trim();
        return format!("({})", expr);
    }
    body.trim().to_string()
}