Skip to main content

mago_syntax_core/
utils.rs

1use bumpalo::Bump;
2use bumpalo::collections::Vec;
3
4use crate::input::Input;
5use crate::number_separator;
6
7/// Parses a PHP literal string, handling all escape sequences, and allocates the result in an arena.
8///
9/// # Returns
10///
11/// An `Option` containing the parsed `&'arena str` or `None` if the input is invalid.
12///
13/// # Panics
14///
15/// Panics if internal assumptions about character parsing are violated (e.g., invalid hex or octal digits
16/// after validation). This should not occur with valid PHP strings.
17pub fn parse_literal_string_in<'arena>(
18    arena: &'arena Bump,
19    s: &'arena str,
20    quote_char: Option<char>,
21    has_quote: bool,
22) -> Option<&'arena str> {
23    if s.is_empty() {
24        return Some("");
25    }
26
27    let s = if has_quote && (s.starts_with("b\"") || s.starts_with("b'") || s.starts_with("B\"") || s.starts_with("B'"))
28    {
29        &s[1..]
30    } else {
31        s
32    };
33
34    let (quote_char, content) = if let Some(quote_char) = quote_char {
35        (Some(quote_char), s)
36    } else if !has_quote {
37        (None, s)
38    } else if s.starts_with('"') && s.ends_with('"') && s.len() >= 2 {
39        (Some('"'), &s[1..s.len() - 1])
40    } else if s.starts_with('\'') && s.ends_with('\'') && s.len() >= 2 {
41        (Some('\''), &s[1..s.len() - 1])
42    } else {
43        return None;
44    };
45
46    let needs_processing = content.contains('\\') || quote_char.is_some_and(|q| content.contains(q));
47    if !needs_processing {
48        return Some(content);
49    }
50
51    let mut result = Vec::with_capacity_in(content.len(), arena);
52    let mut chars = content.chars().peekable();
53    let mut buf = [0; 4];
54
55    while let Some(c) = chars.next() {
56        if c != '\\' {
57            result.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
58            continue;
59        }
60
61        let Some(&next_char) = chars.peek() else {
62            result.push(b'\\');
63            continue;
64        };
65
66        let mut consumed = true;
67
68        match next_char {
69            '\\' => result.push(b'\\'),
70            '\'' if quote_char == Some('\'') => result.push(b'\''),
71            '"' if quote_char == Some('"') => result.push(b'"'),
72            '$' if quote_char == Some('"') => result.push(b'$'),
73            'n' if quote_char == Some('"') => result.push(b'\n'),
74            't' if quote_char == Some('"') => result.push(b'\t'),
75            'r' if quote_char == Some('"') => result.push(b'\r'),
76            'v' if quote_char == Some('"') => result.push(0x0B),
77            'e' if quote_char == Some('"') => result.push(0x1B),
78            'f' if quote_char == Some('"') => result.push(0x0C),
79            '0' if quote_char == Some('"') => result.push(0x00),
80            'x' if quote_char == Some('"') => {
81                chars.next(); // Consume 'x'
82                let mut hex_val = 0u8;
83                let mut hex_len = 0;
84                // Peek up to 2 hex digits
85                while let Some(peeked) = chars.peek() {
86                    if hex_len < 2 && peeked.is_ascii_hexdigit() {
87                        hex_val = hex_val * 16 + peeked.to_digit(16).unwrap() as u8;
88                        hex_len += 1;
89                        chars.next(); // Consume the digit
90                    } else {
91                        break;
92                    }
93                }
94                if hex_len > 0 {
95                    result.push(hex_val);
96                } else {
97                    // Invalid `\x` sequence, treat as literal `\x`
98                    result.push(b'\\');
99                    result.push(b'x');
100                }
101
102                consumed = false;
103            }
104            c if quote_char == Some('"') && c.is_ascii_digit() => {
105                let mut octal_val = 0u16;
106                let mut octal_len = 0;
107
108                while let Some(peeked) = chars.peek() {
109                    if octal_len < 3 && peeked.is_ascii_digit() && *peeked <= '7' {
110                        octal_val = octal_val * 8 + peeked.to_digit(8).unwrap() as u16;
111                        octal_len += 1;
112                        chars.next(); // Consume the digit
113                    } else {
114                        break;
115                    }
116                }
117                if octal_len > 0 {
118                    // Truncate to u8 (matches PHP behavior for octal sequences > 255)
119                    result.push(octal_val as u8);
120                } else {
121                    result.push(b'\\');
122                    result.push(b'0');
123                }
124
125                consumed = false;
126            }
127            _ => {
128                // Unrecognized escape sequence
129                if quote_char == Some('\'') {
130                    // In single quotes, only \' and \\ are special.
131                    result.push(b'\\');
132                    result.extend_from_slice(next_char.encode_utf8(&mut buf).as_bytes());
133                } else {
134                    // In double quotes, an invalid escape is just the character.
135                    result.extend_from_slice(next_char.encode_utf8(&mut buf).as_bytes());
136                }
137            }
138        }
139
140        if consumed {
141            chars.next(); // Consume the character after the backslash
142        }
143    }
144
145    std::str::from_utf8(result.into_bump_slice()).ok()
146}
147
/// Parses a PHP string literal, resolving its escape sequences, and returns the result as
/// a heap-allocated `String`.
///
/// # Arguments
///
/// * `s` - The literal text.
/// * `quote_char` - When `Some`, `s` is taken as the literal's *content* (no delimiters are
///   stripped) and the given quote decides which escapes apply.
/// * `has_quote` - When `quote_char` is `None`: `true` means `s` still carries its delimiters
///   (`'…'` or `"…"`), which are detected and stripped; `false` means `s` is bare content.
///   When `true`, a leading `b`/`B` binary-string prefix directly followed by a quote is
///   dropped first.
///
/// # Escape handling
///
/// * Any context: `\\` becomes a single backslash.
/// * Single-quoted: `\'` becomes `'`; every other escape is kept verbatim.
/// * Double-quoted: `\"`, `\$`, `\n`, `\t`, `\r`, `\v`, `\e`, `\f`, `\xH`/`\xHH` and
///   one-to-three digit octal escapes (`\0` … `\777`, truncated to one byte) are decoded.
/// * Anything else — including `\x` without a hex digit, `\8`, `\9` and `\u{…}`, which this
///   function does not decode — is kept verbatim together with its backslash.
///
/// # Returns
///
/// `Some` with the decoded string, or `None` when `has_quote` is set but `s` is not wrapped
/// in matching quotes.
///
/// # Notes
///
/// A `String` cannot hold arbitrary bytes, so a byte produced by `\x` or an octal escape is
/// stored as the Unicode code point with the same numeric value (`\xE9` becomes U+00E9).
/// The arena-based variant of this parser emits the raw byte instead and is the better
/// choice when an arena is available.
#[inline]
#[must_use]
pub fn parse_literal_string(s: &str, quote_char: Option<char>, has_quote: bool) -> Option<String> {
    if s.is_empty() {
        return Some(String::new());
    }

    // Drop the binary-string prefix (`b"…"` / `B'…'`); it does not affect the value.
    let s = if has_quote && (s.starts_with("b\"") || s.starts_with("b'") || s.starts_with("B\"") || s.starts_with("B'"))
    {
        &s[1..]
    } else {
        s
    };

    let (quote_char, content) = if let Some(quote_char) = quote_char {
        (Some(quote_char), s)
    } else if !has_quote {
        (None, s)
    } else if s.starts_with('"') && s.ends_with('"') && s.len() >= 2 {
        (Some('"'), &s[1..s.len() - 1])
    } else if s.starts_with('\'') && s.ends_with('\'') && s.len() >= 2 {
        (Some('\''), &s[1..s.len() - 1])
    } else {
        return None;
    };

    let double_quoted = quote_char == Some('"');

    let mut result = String::with_capacity(content.len());
    let mut chars = content.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);

            continue;
        }

        let Some(&next_char) = chars.peek() else {
            // Trailing backslash with nothing after it.
            result.push(c);

            continue;
        };

        match next_char {
            '\\' => {
                result.push('\\');
                chars.next();
            }
            '\'' if quote_char == Some('\'') => {
                result.push('\'');
                chars.next();
            }
            '"' if double_quoted => {
                result.push('"');
                chars.next();
            }
            '$' if double_quoted => {
                result.push('$');
                chars.next();
            }
            'n' if double_quoted => {
                result.push('\n');
                chars.next();
            }
            't' if double_quoted => {
                result.push('\t');
                chars.next();
            }
            'r' if double_quoted => {
                result.push('\r');
                chars.next();
            }
            'v' if double_quoted => {
                result.push('\x0B');
                chars.next();
            }
            'e' if double_quoted => {
                result.push('\x1B');
                chars.next();
            }
            'f' if double_quoted => {
                result.push('\x0C');
                chars.next();
            }
            'x' if double_quoted => {
                chars.next(); // Consume 'x'

                let mut value = 0u8;
                let mut digits = 0;
                // At most two hex digits belong to the escape.
                while digits < 2 {
                    let Some(digit) = chars.peek().and_then(|peeked| peeked.to_digit(16)) else {
                        break;
                    };

                    value = value * 16 + digit as u8;
                    digits += 1;
                    chars.next();
                }

                if digits == 0 {
                    // `\x` without digits is not an escape; keep it verbatim.
                    result.push_str("\\x");
                } else {
                    result.push(char::from(value));
                }
            }
            // `\0` is handled here as well, so that `\012` is read as one octal escape
            // rather than NUL followed by "12". `\8` and `\9` are not octal and fall
            // through to the verbatim arm below.
            '0'..='7' if double_quoted => {
                let mut value = 0u16;
                let mut digits = 0;
                // At most three octal digits belong to the escape.
                while digits < 3 {
                    let Some(digit) = chars.peek().and_then(|peeked| peeked.to_digit(8)) else {
                        break;
                    };

                    value = value * 8 + digit as u16;
                    digits += 1;
                    chars.next();
                }

                // Values above 0o377 wrap to a single byte, as PHP does ("\400" == "\000").
                result.push(char::from(value as u8));
            }
            _ => {
                if quote_char == Some('\'') {
                    // In single quotes only `\'` and `\\` are special.
                    result.push(c);
                    result.push(next_char);
                    chars.next();
                } else {
                    // Keep the backslash; the following character is emitted by the next
                    // loop iteration as ordinary text.
                    result.push(c);
                }
            }
        }
    }

    Some(result)
}
297
/// Parses a PHP float literal, ignoring `_` digit separators.
///
/// Text without any underscore is handed to `f64`'s `FromStr` directly. Otherwise the
/// underscores are removed first: inputs with at most 64 remaining bytes are cleaned in a
/// stack buffer, longer ones fall back to a heap-allocated copy.
///
/// # Returns
///
/// The parsed value, or `None` when the cleaned text is not accepted by `f64::from_str`.
#[inline]
#[must_use]
pub fn parse_literal_float(value: &str) -> Option<f64> {
    if !value.contains('_') {
        return value.parse::<f64>().ok();
    }

    let mut buf = [0u8; 64];
    let mut len = 0;

    for byte in value.bytes().filter(|&byte| byte != b'_') {
        let Some(slot) = buf.get_mut(len) else {
            // Does not fit on the stack; let `replace` build the cleaned string instead.
            return value.replace('_', "").parse::<f64>().ok();
        };

        *slot = byte;
        len += 1;
    }

    // Removing ASCII `_` bytes from valid UTF-8 leaves valid UTF-8 (an underscore is never
    // part of a multi-byte sequence), so this check cannot fail in practice; it is kept so
    // that no `unsafe` is needed.
    let cleaned = std::str::from_utf8(&buf[..len]).ok()?;

    cleaned.parse::<f64>().ok()
}
325
/// Parses a PHP integer literal written in binary (`0b`), octal (`0o` or a legacy leading
/// `0`), decimal, or hexadecimal (`0x`) notation. `_` separators are skipped.
///
/// A leading `0` followed only by octal digits and underscores is read as legacy octal; a
/// leading `0` followed by anything else is read as decimal, leading zero included.
///
/// # Returns
///
/// * `None` for empty input, for input without any digit, and for input containing a
///   character that is not a digit of the selected radix.
/// * `Some(u64::MAX)` when the value does not fit in a `u64`.
/// * `Some(value)` otherwise.
#[inline]
#[must_use]
pub fn parse_literal_integer(value: &str) -> Option<u64> {
    let bytes = value.as_bytes();

    // Select the radix and the slice that holds the digits.
    let (radix, digits): (u128, &[u8]) = match bytes {
        [] => return None,
        [b'0', b'x' | b'X', rest @ ..] => (16, rest),
        [b'0', b'o' | b'O', rest @ ..] => (8, rest),
        [b'0', b'b' | b'B', rest @ ..] => (2, rest),
        // Legacy octal: a leading zero followed only by octal digits / separators.
        [b'0', rest @ ..] if !rest.is_empty() && rest.iter().all(|b| matches!(b, b'_' | b'0'..=b'7')) => (8, rest),
        // Plain decimal, including a leading zero followed by non-octal characters.
        _ => (10, bytes),
    };

    let mut accumulator: u128 = 0;
    let mut seen_digit = false;

    for &byte in digits.iter().filter(|&&byte| byte != b'_') {
        let digit = u128::from(match byte {
            b'0'..=b'9' => byte - b'0',
            b'a'..=b'f' => byte - b'a' + 10,
            b'A'..=b'F' => byte - b'A' + 10,
            _ => return None,
        });

        if digit >= radix {
            return None;
        }

        seen_digit = true;

        // Overflowing even the 128-bit accumulator saturates as well.
        let Some(next) = accumulator.checked_mul(radix).and_then(|scaled| scaled.checked_add(digit)) else {
            return Some(u64::MAX);
        };

        accumulator = next;
    }

    if !seen_digit {
        return None;
    }

    Some(u64::try_from(accumulator).unwrap_or(u64::MAX))
}
387
/// Byte-indexed lookup table: an entry is `true` when that byte may begin an identifier,
/// i.e. it is an ASCII letter (`a-z`, `A-Z`) or `_`.
static IS_IDENT_START: [bool; 256] = {
    let mut table = [false; 256];
    let mut index = 0usize;

    // `for` is not available in const evaluation, so walk the 256 slots with `while`.
    while index < 256 {
        table[index] = matches!(index as u8, b'a'..=b'z' | b'A'..=b'Z' | b'_');
        index += 1;
    }

    table
};
403
/// Byte-indexed lookup table: an entry is `true` when that byte may continue an
/// identifier, i.e. it is an ASCII letter, an ASCII digit, `_`, or any byte `>= 0x80`.
static IS_IDENT_PART: [bool; 256] = {
    let mut table = [false; 256];
    let mut index = 0usize;

    // `for` is not available in const evaluation, so walk the 256 slots with `while`.
    while index < 256 {
        table[index] = matches!(index as u8, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF);
        index += 1;
    }

    table
};
418
419/// Check if a byte can start an identifier (a-z, A-Z, _)
420#[inline(always)]
421#[must_use]
422pub const fn is_start_of_identifier(byte: &u8) -> bool {
423    IS_IDENT_START[*byte as usize]
424}
425
426/// Check if a byte can be part of an identifier (a-z, A-Z, 0-9, _, or >= 0x80)
427#[inline(always)]
428#[must_use]
429pub const fn is_part_of_identifier(byte: &u8) -> bool {
430    IS_IDENT_PART[*byte as usize]
431}
432
433/// Scans an identifier starting at `offset` in the byte slice and returns the length.
434/// Assumes the first byte is already validated as a start of identifier.
435/// Returns the total length of the identifier (including the first byte).
436///
437/// Stops at the first byte that is not a valid identifier character.
438#[inline(always)]
439#[must_use]
440pub fn scan_identifier_length(bytes: &[u8], offset: usize) -> usize {
441    let mut len = 1;
442    let remaining = &bytes[offset + 1..];
443
444    for &b in remaining {
445        if IS_IDENT_PART[b as usize] {
446            len += 1;
447        } else {
448            break;
449        }
450    }
451
452    len
453}
454
455/// Reads a sequence of bytes representing digits in a specific numerical base.
456///
457/// This utility function iterates through the input byte slice, consuming bytes
458/// as long as they represent valid digits for the given `base`. It handles
459/// decimal digits ('0'-'9') and hexadecimal digits ('a'-'f', 'A'-'F').
460///
461/// It stops consuming at the first byte that is not a valid digit character,
462/// or is a digit character whose value is greater than or equal to the specified `base`
463/// (e.g., '8' in base 8, or 'A' in base 10).
464///
465/// This function is primarily intended as a helper for lexer implementations
466/// when tokenizing the digit part of number literals (binary, octal, decimal, hexadecimal).
467///
468/// # Arguments
469///
470/// * `input` - A byte slice starting at the potential first digit of the number.
471/// * `base` - The numerical base (e.g., 2, 8, 10, 16) to use for validating digits.
472///   Must be between 2 and 36 (inclusive) for hex characters to be potentially valid.
473///
474/// # Returns
475///
476/// The number of bytes (`usize`) consumed from the beginning of the `input` slice
477/// that constitute a valid sequence of digits for the specified `base`. Returns 0 if
478/// the first byte is not a valid digit for the base.
479#[inline]
480pub fn read_digits_of_base(input: &Input, offset: usize, base: u8) -> usize {
481    if base == 16 {
482        read_digits_with(input, offset, u8::is_ascii_hexdigit)
483    } else {
484        let max = b'0' + base;
485
486        read_digits_with(input, offset, |b| b >= &b'0' && b < &max)
487    }
488}
489
490#[inline]
491fn read_digits_with<F: Fn(&u8) -> bool>(input: &Input, offset: usize, is_digit: F) -> usize {
492    let bytes = input.bytes;
493    let total = input.length;
494    let start = input.offset;
495    let mut pos = start + offset; // Compute the absolute position.
496
497    while pos < total {
498        let current = bytes[pos];
499        if is_digit(&current) {
500            pos += 1;
501        } else if pos + 1 < total && bytes[pos] == number_separator!() && is_digit(&bytes[pos + 1]) {
502            pos += 2; // Skip the separator and the digit.
503        } else {
504            break;
505        }
506    }
507
508    // Return the relative length from the start of the current position.
509    pos - start
510}
511
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `parse_literal_integer` against a table of literal / expected-value pairs
    /// covering every radix, separators, and invalid input.
    #[test]
    fn test_parse_literal_integer() {
        let cases: &[(&str, Option<u64>)] = &[
            // Plain literals in each radix.
            ("123", Some(123)),
            ("0", Some(0)),
            ("0b1010", Some(10)),
            ("0o17", Some(15)),
            ("0x1A3F", Some(6719)),
            ("0XFF", Some(255)),
            // Digit separators.
            ("0_1_2_3", Some(83)),
            ("0b1_0_1_0", Some(10)),
            ("0o1_7", Some(15)),
            ("0x1_A_3_F", Some(6719)),
            // Invalid input.
            ("", None),
            ("0xGHI", None),
            ("0b102", None),
            ("0o89", None),
        ];

        for &(input, expected) in cases {
            assert_eq!(parse_literal_integer(input), expected);
        }
    }
}
539}