Skip to main content

perl_quote/
lib.rs

1//! Uniform quote operator parsing for the Perl parser.
2//!
3//! This module provides consistent parsing for quote-like operators,
4//! properly extracting patterns, bodies, and modifiers.
5
6use std::borrow::Cow;
7
8/// Extract pattern and modifiers from a regex-like token (qr, m, or bare //)
9pub fn extract_regex_parts(text: &str) -> (String, String, String) {
10    // Handle different prefixes
11    let content = if let Some(stripped) = text.strip_prefix("qr") {
12        stripped
13    } else if text.starts_with('m')
14        && text.len() > 1
15        && text.chars().nth(1).is_some_and(|c| !c.is_alphabetic())
16    {
17        &text[1..]
18    } else {
19        text
20    };
21
22    // Get delimiter - content must be non-empty to have a delimiter
23    let delimiter = match content.chars().next() {
24        Some(d) => d,
25        None => return (String::new(), String::new(), String::new()),
26    };
27    let closing = get_closing_delimiter(delimiter);
28
29    // Extract body and modifiers
30    let (body, modifiers) = extract_delimited_content(content, delimiter, closing);
31
32    // Include delimiters in the pattern string for compatibility
33    let pattern = format!("{}{}{}", delimiter, body, closing);
34
35    (pattern, body, modifiers.to_string())
36}
37
/// Error type for substitution operator parsing failures.
///
/// Implements [`std::fmt::Display`] and [`std::error::Error`] so it can be
/// propagated with `?` into boxed or wrapped error types.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SubstitutionError {
    /// Invalid modifier character found
    InvalidModifier(char),
    /// Missing delimiter after 's'
    MissingDelimiter,
    /// Pattern is missing or empty (just `s/`)
    MissingPattern,
    /// Replacement section is missing (e.g., `s/pattern` without replacement part)
    MissingReplacement,
    /// Closing delimiter is missing after replacement (e.g., `s/pattern/replacement` without final `/`)
    MissingClosingDelimiter,
}

impl std::fmt::Display for SubstitutionError {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::InvalidModifier(c) => write!(f, "invalid substitution modifier '{c}'"),
            Self::MissingDelimiter => f.write_str("missing delimiter after 's'"),
            Self::MissingPattern => f.write_str("missing or empty pattern"),
            Self::MissingReplacement => f.write_str("missing replacement section"),
            Self::MissingClosingDelimiter => f.write_str("missing closing delimiter"),
        }
    }
}

impl std::error::Error for SubstitutionError {}
52
53/// Extract pattern, replacement, and modifiers from a substitution token with strict validation
54///
55/// This function parses substitution operators like s/pattern/replacement/flags
56/// and handles various delimiter forms including:
57/// - Non-paired delimiters: s/pattern/replacement/ (same delimiter for all parts)
58/// - Paired delimiters: s{pattern}{replacement} (different open/close delimiters)
59///
60/// Unlike `extract_substitution_parts`, this function returns an error if invalid modifiers
61/// are present instead of silently filtering them.
62///
63/// # Errors
64///
65/// Returns `Err(SubstitutionError::InvalidModifier(c))` if an invalid modifier character is found.
66/// Valid modifiers are: g, i, m, s, x, o, e, r
67pub fn extract_substitution_parts_strict(
68    text: &str,
69) -> Result<(String, String, String), SubstitutionError> {
70    // Skip 's' prefix
71    let content = text.strip_prefix('s').unwrap_or(text);
72
73    // Get delimiter - check for missing delimiter (just 's' or 's' followed by nothing)
74    let delimiter = match content.chars().next() {
75        Some(d) => d,
76        None => return Err(SubstitutionError::MissingDelimiter),
77    };
78    let closing = get_closing_delimiter(delimiter);
79    let is_paired = delimiter != closing;
80
81    // Parse first body (pattern) with strict validation
82    let (pattern, rest1, pattern_closed) =
83        extract_delimited_content_strict(content, delimiter, closing);
84
85    // For non-paired delimiters: if pattern wasn't closed, missing closing delimiter
86    if !is_paired && !pattern_closed {
87        return Err(SubstitutionError::MissingClosingDelimiter);
88    }
89
90    // For paired delimiters: if pattern wasn't closed, missing closing delimiter
91    if is_paired && !pattern_closed {
92        return Err(SubstitutionError::MissingClosingDelimiter);
93    }
94
95    // Parse second body (replacement)
96    // For paired delimiters, the replacement may use a different delimiter than the pattern
97    // e.g., s[pattern]{replacement} is valid Perl
98    let (replacement, modifiers_str, replacement_closed) = if !is_paired {
99        // Non-paired delimiters: must have replacement section
100        if rest1.is_empty() {
101            return Err(SubstitutionError::MissingReplacement);
102        }
103
104        // Manually parse the replacement
105        let chars = rest1.char_indices();
106        let mut body = String::new();
107        let mut escaped = false;
108        let mut end_pos = rest1.len();
109        let mut found_closing = false;
110
111        for (i, ch) in chars {
112            if escaped {
113                body.push(ch);
114                escaped = false;
115                continue;
116            }
117
118            match ch {
119                '\\' => {
120                    body.push(ch);
121                    escaped = true;
122                }
123                c if c == closing => {
124                    end_pos = i + ch.len_utf8();
125                    found_closing = true;
126                    break;
127                }
128                _ => body.push(ch),
129            }
130        }
131
132        (body, &rest1[end_pos..], found_closing)
133    } else {
134        // Paired delimiters
135        let trimmed = rest1.trim_start();
136        // For paired delimiters, check what delimiter the replacement uses
137        // It may be the same as pattern or a different paired delimiter
138        // e.g., s[pattern]{replacement} uses [] for pattern and {} for replacement
139        if let Some(rd) = trimmed.chars().next() {
140            // Check if it's a valid paired opening delimiter
141            if rd == '{' || rd == '[' || rd == '(' || rd == '<' {
142                let repl_closing = get_closing_delimiter(rd);
143                extract_delimited_content_strict(trimmed, rd, repl_closing)
144            } else {
145                // Not a valid paired delimiter - malformed
146                return Err(SubstitutionError::MissingReplacement);
147            }
148        } else {
149            // No more content - missing replacement
150            return Err(SubstitutionError::MissingReplacement);
151        }
152    };
153
154    // For non-paired delimiters, must have found the closing delimiter for replacement
155    if !is_paired && !replacement_closed {
156        return Err(SubstitutionError::MissingClosingDelimiter);
157    }
158
159    // For paired delimiters, must have found the closing delimiter for replacement
160    if is_paired && !replacement_closed {
161        return Err(SubstitutionError::MissingClosingDelimiter);
162    }
163
164    // Validate modifiers strictly - reject if any invalid modifiers present
165    let modifiers = validate_substitution_modifiers(modifiers_str)
166        .map_err(SubstitutionError::InvalidModifier)?;
167
168    Ok((pattern, replacement, modifiers))
169}
170
/// Pull out the text enclosed by `open` … `close` at the start of `text`,
/// reporting whether the closing delimiter was actually seen.
///
/// Returns `(content, rest, found_closing)`:
/// - `content` is the text between the delimiters (backslashes are kept),
/// - `rest` is whatever follows the closing delimiter,
/// - `found_closing` is `false` when the input ended before the delimiter closed.
///
/// If `text` does not begin with `open`, the result is `("", text, false)`;
/// for empty input it is `("", "", false)`. When `open != close` nested
/// pairs are balanced; a backslash makes the next character literal.
fn extract_delimited_content_strict(text: &str, open: char, close: char) -> (String, &str, bool) {
    let mut scanner = text.char_indices();

    // The very first character has to be the opening delimiter.
    match scanner.next() {
        None => return (String::new(), "", false),
        Some((_, first)) if first != open => return (String::new(), text, false),
        Some(_) => {}
    }

    let nests = open != close;
    let mut content = String::new();
    // Only meaningful for bracket pairs: number of currently open brackets.
    let mut depth = 1usize;
    let mut pending_escape = false;

    for (offset, ch) in scanner {
        if pending_escape {
            pending_escape = false;
        } else if ch == '\\' {
            pending_escape = true;
        } else if nests && ch == open {
            depth += 1;
        } else if ch == close {
            if nests {
                depth -= 1;
            }
            if !nests || depth == 0 {
                // Outermost closing delimiter: everything after it is the rest.
                return (content, &text[offset + ch.len_utf8()..], true);
            }
        }
        content.push(ch);
    }

    // Ran off the end without closing.
    (content, &text[text.len()..], false)
}
229
230/// Extract pattern, replacement, and modifiers from a substitution token
231///
232/// This function parses substitution operators like s/pattern/replacement/flags
233/// and handles various delimiter forms including:
234/// - Non-paired delimiters: s/pattern/replacement/ (same delimiter for all parts)
235/// - Paired delimiters: s{pattern}{replacement} (different open/close delimiters)
236///
237/// For paired delimiters, properly handles nested delimiters within the pattern
238/// or replacement parts. Returns (pattern, replacement, modifiers) as strings.
239///
240/// Note: This function silently filters invalid modifiers. For strict validation,
241/// use `extract_substitution_parts_strict` instead.
242pub fn extract_substitution_parts(text: &str) -> (String, String, String) {
243    // Skip 's' prefix
244    let content = text.strip_prefix('s').unwrap_or(text);
245
246    // Get delimiter - content must be non-empty to have a delimiter
247    let delimiter = match content.chars().next() {
248        Some(d) => d,
249        None => return (String::new(), String::new(), String::new()),
250    };
251    if delimiter.is_ascii_alphanumeric() || delimiter.is_whitespace() {
252        if let Some((pattern, replacement, modifiers_str)) = split_on_last_paired_delimiter(content)
253        {
254            let modifiers = extract_substitution_modifiers(&modifiers_str);
255            return (pattern, replacement, modifiers);
256        }
257
258        return (String::new(), String::new(), String::new());
259    }
260    let closing = get_closing_delimiter(delimiter);
261    let is_paired = delimiter != closing;
262
263    // Parse first body (pattern)
264    let (mut pattern, rest1, pattern_closed) = if is_paired {
265        extract_substitution_pattern_with_replacement_hint(content, delimiter, closing)
266    } else {
267        extract_delimited_content_strict(content, delimiter, closing)
268    };
269
270    // Parse second body (replacement)
271    // For paired delimiters, the replacement may use a different delimiter than the pattern
272    // e.g., s[pattern]{replacement} is valid Perl
273    let (replacement, modifiers_str) = if !is_paired && !rest1.is_empty() {
274        // Non-paired delimiters: manually parse the replacement
275        let (body, rest) = extract_unpaired_body(rest1, closing);
276        (body, Cow::Borrowed(rest))
277    } else if !is_paired && !pattern_closed {
278        if let Some((fallback_pattern, fallback_replacement, fallback_modifiers)) =
279            split_unclosed_substitution_pattern(&pattern)
280        {
281            pattern = fallback_pattern;
282            (fallback_replacement, Cow::Owned(fallback_modifiers))
283        } else {
284            (String::new(), Cow::Borrowed(rest1))
285        }
286    } else if is_paired {
287        let trimmed = rest1.trim_start();
288        // For paired delimiters, check what delimiter the replacement uses
289        // It may be the same as pattern or a different paired delimiter
290        // e.g., s[pattern]{replacement} uses [] for pattern and {} for replacement
291        if let Some(rd) = starts_with_paired_delimiter(trimmed) {
292            let repl_closing = get_closing_delimiter(rd);
293            let (body, rest) = extract_delimited_content(trimmed, rd, repl_closing);
294            (body, Cow::Borrowed(rest))
295        } else {
296            let (body, rest) = extract_unpaired_body(rest1, closing);
297            (body, Cow::Borrowed(rest))
298        }
299    } else {
300        (String::new(), Cow::Borrowed(rest1))
301    };
302
303    // Extract and validate only valid substitution modifiers
304    let modifiers = extract_substitution_modifiers(modifiers_str.as_ref());
305
306    (pattern, replacement, modifiers)
307}
308
309/// Extract search, replace, and modifiers from a transliteration token
310pub fn extract_transliteration_parts(text: &str) -> (String, String, String) {
311    // Skip 'tr' or 'y' prefix
312    let content = if let Some(stripped) = text.strip_prefix("tr") {
313        stripped
314    } else if let Some(stripped) = text.strip_prefix('y') {
315        stripped
316    } else {
317        text
318    };
319
320    // Get delimiter - content must be non-empty to have a delimiter
321    let delimiter = match content.chars().next() {
322        Some(d) => d,
323        None => return (String::new(), String::new(), String::new()),
324    };
325    let closing = get_closing_delimiter(delimiter);
326    let is_paired = delimiter != closing;
327
328    // Parse first body (search pattern)
329    let (search, rest1) = extract_delimited_content(content, delimiter, closing);
330
331    // For paired delimiters, skip whitespace and expect new delimiter
332    let rest2_owned;
333    let rest2 = if is_paired {
334        let trimmed = rest1.trim_start();
335        // For paired delimiters like tr{search}{replace}, we expect another opening delimiter
336        if trimmed.starts_with(delimiter) {
337            // Keep the delimiter - don't strip it since extract_delimited_content expects it
338            trimmed
339        } else {
340            // If no second delimiter found, the replacement is empty
341            ""
342        }
343    } else {
344        rest2_owned = format!("{}{}", delimiter, rest1);
345        &rest2_owned
346    };
347
348    // Parse second body (replacement pattern)
349    let (replacement, modifiers_str) = if !is_paired && !rest1.is_empty() {
350        // Manually parse the replacement for non-paired delimiters
351        let chars = rest1.char_indices();
352        let mut body = String::new();
353        let mut escaped = false;
354        let mut end_pos = rest1.len();
355
356        for (i, ch) in chars {
357            if escaped {
358                body.push(ch);
359                escaped = false;
360                continue;
361            }
362
363            match ch {
364                '\\' => {
365                    body.push(ch);
366                    escaped = true;
367                }
368                c if c == closing => {
369                    end_pos = i + ch.len_utf8();
370                    break;
371                }
372                _ => body.push(ch),
373            }
374        }
375
376        (body, &rest1[end_pos..])
377    } else if is_paired {
378        extract_delimited_content(rest2, delimiter, closing)
379    } else {
380        (String::new(), rest1)
381    };
382
383    // Extract and validate only valid transliteration modifiers
384    // Security fix: Apply consistent validation for all delimiter types
385    let modifiers = modifiers_str
386        .chars()
387        .take_while(|c| c.is_ascii_alphabetic())
388        .filter(|&c| matches!(c, 'c' | 'd' | 's' | 'r'))
389        .collect();
390
391    (search, replacement, modifiers)
392}
393
/// Map an opening delimiter to the character that closes it.
///
/// The four bracket openers `(`, `[`, `{`, `<` map to their counterparts;
/// every other character closes itself.
fn get_closing_delimiter(open: char) -> char {
    const BRACKET_PAIRS: [(char, char); 4] = [('(', ')'), ('[', ']'), ('{', '}'), ('<', '>')];

    BRACKET_PAIRS
        .iter()
        .find(|&&(opener, _)| opener == open)
        .map_or(open, |&(_, closer)| closer)
}
404
/// Report whether `ch` is one of the bracket openers `{`, `[`, `(` or `<`.
fn is_paired_open(ch: char) -> bool {
    ['{', '[', '(', '<'].contains(&ch)
}
408
409fn starts_with_paired_delimiter(text: &str) -> Option<char> {
410    let trimmed = text.trim_start();
411    match trimmed.chars().next() {
412        Some(ch) if is_paired_open(ch) => Some(ch),
413        _ => None,
414    }
415}
416
/// Pull out the text enclosed by `open` … `close` at the start of `text`.
///
/// Returns `(content, rest)`: the text between the delimiters (backslashes
/// are kept) and whatever follows the closing delimiter. If the closing
/// delimiter never appears, `content` holds everything after the opener and
/// `rest` is empty.
///
/// If `text` does not begin with `open`, the result is `("", text)`; for
/// empty input it is `("", "")`. When `open != close` nested pairs are
/// balanced; a backslash makes the next character literal.
fn extract_delimited_content(text: &str, open: char, close: char) -> (String, &str) {
    let mut cursor = text.char_indices();

    // The very first character has to be the opening delimiter.
    match cursor.next() {
        None => return (String::new(), ""),
        Some((_, lead)) if lead != open => return (String::new(), text),
        Some(_) => {}
    }

    let nested = open != close;
    let mut inner = String::new();
    // Only meaningful for bracket pairs: number of currently open brackets.
    let mut open_levels = 1usize;
    let mut take_literally = false;

    for (at, ch) in cursor {
        if take_literally {
            take_literally = false;
        } else if ch == '\\' {
            take_literally = true;
        } else if nested && ch == open {
            open_levels += 1;
        } else if ch == close {
            if nested {
                open_levels -= 1;
            }
            if !nested || open_levels == 0 {
                // Outermost closing delimiter: stop here.
                return (inner, &text[at + ch.len_utf8()..]);
            }
        }
        inner.push(ch);
    }

    (inner, &text[text.len()..])
}
471
/// Read `text` up to the first unescaped `closing` character.
///
/// Unlike `extract_delimited_content`, no opening delimiter is expected.
/// Returns `(body, rest)` where `body` is the text before the delimiter
/// (backslashes are kept) and `rest` is what follows it. If the delimiter
/// never appears, `body` is the whole input and `rest` is empty. A backslash
/// never acts as the delimiter and makes the next character literal.
fn extract_unpaired_body(text: &str, closing: char) -> (String, &str) {
    let mut collected = String::new();
    let mut after_backslash = false;

    for (offset, ch) in text.char_indices() {
        if !after_backslash && ch != '\\' && ch == closing {
            return (collected, &text[offset + ch.len_utf8()..]);
        }
        // A backslash escapes the next character unless it is itself escaped.
        after_backslash = !after_backslash && ch == '\\';
        collected.push(ch);
    }

    (collected, &text[text.len()..])
}
499
500fn extract_substitution_pattern_with_replacement_hint(
501    text: &str,
502    open: char,
503    close: char,
504) -> (String, &str, bool) {
505    let mut chars = text.char_indices();
506
507    // Skip opening delimiter
508    if let Some((_, c)) = chars.next() {
509        if c != open {
510            return (String::new(), text, false);
511        }
512    } else {
513        return (String::new(), "", false);
514    }
515
516    let mut body = String::new();
517    let mut depth = 1usize;
518    let mut escaped = false;
519    let mut first_close_pos: Option<usize> = None;
520    let mut first_body_len: usize = 0;
521
522    for (i, ch) in chars {
523        if escaped {
524            body.push(ch);
525            escaped = false;
526            continue;
527        }
528
529        match ch {
530            '\\' => {
531                body.push(ch);
532                escaped = true;
533            }
534            c if c == open => {
535                body.push(ch);
536                depth += 1;
537            }
538            c if c == close => {
539                if depth > 1 {
540                    depth -= 1;
541                    body.push(ch);
542                    continue;
543                }
544
545                let rest = &text[i + ch.len_utf8()..];
546                if first_close_pos.is_none() {
547                    first_close_pos = Some(i + ch.len_utf8());
548                    first_body_len = body.len();
549                }
550
551                if starts_with_paired_delimiter(rest).is_some() {
552                    return (body, rest, true);
553                }
554
555                body.push(ch);
556            }
557            _ => body.push(ch),
558        }
559    }
560
561    if let Some(pos) = first_close_pos {
562        body.truncate(first_body_len);
563        return (body, &text[pos..], true);
564    }
565
566    (body, "", false)
567}
568
569fn split_unclosed_substitution_pattern(pattern: &str) -> Option<(String, String, String)> {
570    let mut escaped = false;
571
572    for (idx, ch) in pattern.char_indices() {
573        if escaped {
574            escaped = false;
575            continue;
576        }
577
578        if ch == '\\' {
579            escaped = true;
580            continue;
581        }
582
583        if is_paired_open(ch) {
584            let closing = get_closing_delimiter(ch);
585            let (replacement, rest, found_closing) =
586                extract_delimited_content_strict(&pattern[idx..], ch, closing);
587            if found_closing {
588                let leading = pattern[..idx].to_string();
589                return Some((leading, replacement, rest.to_string()));
590            }
591        }
592    }
593
594    None
595}
596
597fn split_on_last_paired_delimiter(text: &str) -> Option<(String, String, String)> {
598    let mut escaped = false;
599    let mut candidates = Vec::new();
600
601    for (idx, ch) in text.char_indices() {
602        if escaped {
603            escaped = false;
604            continue;
605        }
606
607        if ch == '\\' {
608            escaped = true;
609            continue;
610        }
611
612        if is_paired_open(ch) {
613            candidates.push((idx, ch));
614        }
615    }
616
617    for (idx, ch) in candidates.into_iter().rev() {
618        let closing = get_closing_delimiter(ch);
619        let (replacement, rest, found_closing) =
620            extract_delimited_content_strict(&text[idx..], ch, closing);
621        if found_closing {
622            let leading = text[..idx].to_string();
623            return Some((leading, replacement, rest.to_string()));
624        }
625    }
626
627    None
628}
629
/// Collect the recognised substitution modifiers from the start of `text`.
///
/// Scanning stops at the first character that is not an ASCII letter. Within
/// that leading run, letters outside the accepted set are dropped rather than
/// reported. The accepted set is:
/// - Core modifiers: g, i, m, s, x, o, e, r
/// - Charset modifiers (Perl 5.14+): a, d, l, u
/// - Additional modifiers: n (5.22+), p, c
///
/// This function never fails; unrecognised input simply produces fewer
/// (possibly zero) modifiers.
fn extract_substitution_modifiers(text: &str) -> String {
    const ACCEPTED: &str = "gimsxoeradlunpc";

    let mut kept = String::new();
    for ch in text.chars() {
        if !ch.is_ascii_alphabetic() {
            break;
        }
        if ACCEPTED.contains(ch) {
            kept.push(ch);
        }
    }
    kept
}
663
/// Check a substitution modifier string, failing on the first bad character.
///
/// Accepted modifiers:
/// - Core modifiers: g, i, m, s, x, o, e, r
/// - Charset modifiers (Perl 5.14+): a, d, l, u
/// - Additional modifiers: n (5.22+), p, c
///
/// # Arguments
///
/// * `modifiers_str` - The raw text following the substitution operator
///
/// # Returns
///
/// * `Ok(String)` - The modifiers read before the end of input, or before the
///   first whitespace character or `;` (both end the modifier list)
/// * `Err(char)` - The first character that is neither an accepted modifier
///   nor one of the terminators above
///
/// # Examples
///
/// ```ignore
/// assert!(validate_substitution_modifiers("gi").is_ok());
/// assert!(validate_substitution_modifiers("gia").is_ok());  // 'a' for ASCII mode
/// assert!(validate_substitution_modifiers("giz").is_err()); // 'z' is invalid
/// ```
pub fn validate_substitution_modifiers(modifiers_str: &str) -> Result<String, char> {
    let mut accepted = String::new();

    for ch in modifiers_str.chars() {
        match ch {
            'g' | 'i' | 'm' | 's' | 'x' | 'o' | 'e' | 'r' | 'a' | 'd' | 'l' | 'u' | 'n' | 'p'
            | 'c' => accepted.push(ch),
            // Whitespace (including newlines) or a statement terminator ends
            // the modifier list without error.
            end if !end.is_ascii_alphabetic() && (end.is_whitespace() || end == ';') => break,
            // Any other letter or symbol in modifier position is rejected.
            invalid => return Err(invalid),
        }
    }

    Ok(accepted)
}