Skip to main content

mago_syntax_core/
utils.rs

1use bumpalo::Bump;
2use bumpalo::collections::Vec;
3
4use crate::input::Input;
5use crate::number_separator;
6
7/// Parses a PHP literal string, handling all escape sequences, and allocates the result in an arena.
8///
9/// # Returns
10///
11/// An `Option` containing the parsed `&'arena str` or `None` if the input is invalid.
12///
13/// # Panics
14///
15/// Panics if internal assumptions about character parsing are violated (e.g., invalid hex or octal digits
16/// after validation). This should not occur with valid PHP strings.
17pub fn parse_literal_string_in<'arena>(
18    arena: &'arena Bump,
19    s: &'arena str,
20    quote_char: Option<char>,
21    has_quote: bool,
22) -> Option<&'arena str> {
23    if s.is_empty() {
24        return Some("");
25    }
26
27    let (quote_char, content) = if let Some(quote_char) = quote_char {
28        (Some(quote_char), s)
29    } else if !has_quote {
30        (None, s)
31    } else if s.starts_with('"') && s.ends_with('"') && s.len() >= 2 {
32        (Some('"'), &s[1..s.len() - 1])
33    } else if s.starts_with('\'') && s.ends_with('\'') && s.len() >= 2 {
34        (Some('\''), &s[1..s.len() - 1])
35    } else {
36        return None;
37    };
38
39    let needs_processing = content.contains('\\') || quote_char.is_some_and(|q| content.contains(q));
40    if !needs_processing {
41        return Some(content);
42    }
43
44    let mut result = Vec::with_capacity_in(content.len(), arena);
45    let mut chars = content.chars().peekable();
46    let mut buf = [0; 4];
47
48    while let Some(c) = chars.next() {
49        if c != '\\' {
50            result.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
51            continue;
52        }
53
54        let Some(&next_char) = chars.peek() else {
55            result.push(b'\\');
56            continue;
57        };
58
59        let mut consumed = true;
60
61        match next_char {
62            '\\' => result.push(b'\\'),
63            '\'' if quote_char == Some('\'') => result.push(b'\''),
64            '"' if quote_char == Some('"') => result.push(b'"'),
65            '$' if quote_char == Some('"') => result.push(b'$'),
66            'n' if quote_char == Some('"') => result.push(b'\n'),
67            't' if quote_char == Some('"') => result.push(b'\t'),
68            'r' if quote_char == Some('"') => result.push(b'\r'),
69            'v' if quote_char == Some('"') => result.push(0x0B),
70            'e' if quote_char == Some('"') => result.push(0x1B),
71            'f' if quote_char == Some('"') => result.push(0x0C),
72            '0' if quote_char == Some('"') => result.push(0x00),
73            'x' if quote_char == Some('"') => {
74                chars.next(); // Consume 'x'
75                let mut hex_val = 0u8;
76                let mut hex_len = 0;
77                // Peek up to 2 hex digits
78                while let Some(peeked) = chars.peek() {
79                    if hex_len < 2 && peeked.is_ascii_hexdigit() {
80                        hex_val = hex_val * 16 + peeked.to_digit(16).unwrap() as u8;
81                        hex_len += 1;
82                        chars.next(); // Consume the digit
83                    } else {
84                        break;
85                    }
86                }
87                if hex_len > 0 {
88                    result.push(hex_val);
89                } else {
90                    // Invalid `\x` sequence, treat as literal `\x`
91                    result.push(b'\\');
92                    result.push(b'x');
93                }
94
95                consumed = false;
96            }
97            c if quote_char == Some('"') && c.is_ascii_digit() => {
98                let mut octal_val = 0u16;
99                let mut octal_len = 0;
100
101                while let Some(peeked) = chars.peek() {
102                    if octal_len < 3 && peeked.is_ascii_digit() && *peeked <= '7' {
103                        octal_val = octal_val * 8 + peeked.to_digit(8).unwrap() as u16;
104                        octal_len += 1;
105                        chars.next(); // Consume the digit
106                    } else {
107                        break;
108                    }
109                }
110                if octal_len > 0 {
111                    // Truncate to u8 (matches PHP behavior for octal sequences > 255)
112                    result.push(octal_val as u8);
113                } else {
114                    result.push(b'\\');
115                    result.push(b'0');
116                }
117
118                consumed = false;
119            }
120            _ => {
121                // Unrecognized escape sequence
122                if quote_char == Some('\'') {
123                    // In single quotes, only \' and \\ are special.
124                    result.push(b'\\');
125                    result.extend_from_slice(next_char.encode_utf8(&mut buf).as_bytes());
126                } else {
127                    // In double quotes, an invalid escape is just the character.
128                    result.extend_from_slice(next_char.encode_utf8(&mut buf).as_bytes());
129                }
130            }
131        }
132
133        if consumed {
134            chars.next(); // Consume the character after the backslash
135        }
136    }
137
138    std::str::from_utf8(result.into_bump_slice()).ok()
139}
140
/// Parses a PHP string literal, resolving its escape sequences, and returns the result as a `String`.
///
/// The quoting style is decided as follows:
///
/// * If `quote_char` is `Some`, it is used as the style and `s` is taken as the content as-is
///   (no delimiters are stripped).
/// * Otherwise, if `has_quote` is `false`, `s` is treated as unquoted content; only `\\` is
///   unescaped.
/// * Otherwise, `s` must be wrapped in a matching pair of `"` or `'`, which is stripped and
///   determines the style.
///
/// Double-quoted content recognizes `\\`, `\"`, `\$`, `\n`, `\t`, `\r`, `\v`, `\e`, `\f`,
/// `\xH` / `\xHH`, and one to three octal digits (truncated to a single byte). Single-quoted
/// content recognizes only `\\` and `\'`. Every other backslash sequence is kept verbatim,
/// backslash included, which is what PHP does for unknown escapes.
///
/// # Returns
///
/// `Some` with the parsed string, or `None` when `has_quote` is set but `s` is not wrapped in
/// matching quotes.
///
/// # Notes
///
/// This function is similar to `parse_literal_string_in`, but it allocates the result on the heap
/// instead of in an arena. Prefer `parse_literal_string_in` when an arena is available.
///
/// Byte values produced by `\x..` and octal escapes are converted with `char::from(u8)`, so a
/// value of `0x80` or above becomes the code point U+0080..=U+00FF rather than a raw byte.
#[inline]
#[must_use]
pub fn parse_literal_string(s: &str, quote_char: Option<char>, has_quote: bool) -> Option<String> {
    if s.is_empty() {
        return Some(String::new());
    }

    let (quote_char, content) = if let Some(quote_char) = quote_char {
        (Some(quote_char), s)
    } else if !has_quote {
        (None, s)
    } else if s.starts_with('"') && s.ends_with('"') && s.len() >= 2 {
        (Some('"'), &s[1..s.len() - 1])
    } else if s.starts_with('\'') && s.ends_with('\'') && s.len() >= 2 {
        (Some('\''), &s[1..s.len() - 1])
    } else {
        return None;
    };

    let double_quoted = quote_char == Some('"');
    let single_quoted = quote_char == Some('\'');

    // Unescaping never makes the string longer, so this capacity avoids regrowth.
    let mut result = String::with_capacity(content.len());
    let mut chars = content.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }

        // A trailing backslash has nothing to escape; keep it literally.
        let Some(&next_char) = chars.peek() else {
            result.push('\\');
            continue;
        };

        // Escapes that map one character to one character.
        let simple = match next_char {
            '\\' => Some('\\'),
            '\'' if single_quoted => Some('\''),
            '"' | '$' if double_quoted => Some(next_char),
            'n' if double_quoted => Some('\n'),
            't' if double_quoted => Some('\t'),
            'r' if double_quoted => Some('\r'),
            'v' if double_quoted => Some('\x0B'),
            'e' if double_quoted => Some('\x1B'),
            'f' if double_quoted => Some('\x0C'),
            _ => None,
        };

        if let Some(unescaped) = simple {
            result.push(unescaped);
            chars.next(); // Consume the character after the backslash
            continue;
        }

        match next_char {
            'x' if double_quoted => {
                chars.next(); // Consume 'x'

                let mut value = 0u8;
                let mut digits = 0;
                // Take up to 2 hex digits; two hex digits always fit in a byte.
                while digits < 2 {
                    let Some(digit) = chars.peek().and_then(|d| d.to_digit(16)) else {
                        break;
                    };

                    value = value * 16 + digit as u8;
                    digits += 1;
                    chars.next(); // Consume the digit
                }

                if digits == 0 {
                    // `\x` without digits is not an escape; keep it as literal `\x`.
                    result.push_str("\\x");
                } else {
                    result.push(char::from(value));
                }
            }
            // Octal escape. `\0` is simply the one-digit case, so `\012` is parsed as a whole
            // instead of as NUL followed by "12". `8` and `9` are not octal digits and fall
            // through to the "unknown escape" arm.
            '0'..='7' if double_quoted => {
                let mut value = 0u16;
                let mut digits = 0;
                // Take up to 3 octal digits (the first one is guaranteed by the pattern).
                while digits < 3 {
                    let Some(digit) = chars.peek().and_then(|d| d.to_digit(8)) else {
                        break;
                    };

                    value = value * 8 + digit as u16;
                    digits += 1;
                    chars.next(); // Consume the digit
                }

                // Truncate to u8 (matches PHP behavior for octal sequences > 255)
                result.push(char::from(value as u8));
            }
            _ => {
                // Unrecognized escape sequence: the backslash is not special here, so both
                // the backslash and the following character are kept.
                result.push('\\');
                result.push(next_char);
                chars.next();
            }
        }
    }

    Some(result)
}
290
/// Parses a PHP literal float, handling underscore separators.
///
/// Every `_` in `value` is dropped and the remainder is parsed with `str::parse::<f64>`.
/// Inputs of up to 64 non-underscore bytes are cleaned in a stack buffer; longer inputs fall
/// back to a heap-allocated copy.
///
/// # Returns
///
/// `Some` with the parsed value, or `None` if the text left after removing underscores is not
/// something `f64`'s parser accepts.
#[inline]
#[must_use]
pub fn parse_literal_float(value: &str) -> Option<f64> {
    // Fast path: nothing to strip.
    if !value.as_bytes().contains(&b'_') {
        return value.parse::<f64>().ok();
    }

    let mut buf = [0u8; 64];
    let mut len = 0;

    for b in value.bytes().filter(|&b| b != b'_') {
        let Some(slot) = buf.get_mut(len) else {
            // Too long for the stack buffer; fall back to an allocating strip.
            return value.replace('_', "").parse::<f64>().ok();
        };

        *slot = b;
        len += 1;
    }

    // `_` is a single-byte ASCII character, so removing it from valid UTF-8 leaves valid
    // UTF-8. The checked conversion is used anyway so no `unsafe` is needed; its cost is
    // negligible for at most 64 bytes.
    std::str::from_utf8(&buf[..len]).ok()?.parse::<f64>().ok()
}
318
/// Parses a PHP integer literal written in binary, octal, decimal, or hexadecimal notation.
///
/// The radix is chosen from the prefix: `0x`/`0X` (hex), `0o`/`0O` (octal), `0b`/`0B`
/// (binary), a leading `0` followed by anything else (legacy octal), otherwise decimal.
/// Underscore separators are skipped wherever they appear after the prefix.
///
/// Works on bytes rather than Unicode scalar values.
///
/// # Returns
///
/// `None` for empty input, for input without any digit after the prefix, or when a byte is
/// not a digit of the selected radix. Values larger than `u64::MAX` saturate to `u64::MAX`.
#[inline]
#[must_use]
pub fn parse_literal_integer(value: &str) -> Option<u64> {
    let bytes = value.as_bytes();

    let (radix, digits): (u32, &[u8]) = match bytes {
        [] => return None,
        [b'0', b'x' | b'X', rest @ ..] => (16, rest),
        [b'0', b'o' | b'O', rest @ ..] => (8, rest),
        [b'0', b'b' | b'B', rest @ ..] => (2, rest),
        // Legacy octal: only the leading zero is a prefix.
        [b'0', _, ..] => (8, &bytes[1..]),
        _ => (10, bytes),
    };

    let wide_radix = u128::from(radix);
    let mut accumulator: u128 = 0;
    let mut seen_digit = false;

    for &byte in digits.iter().filter(|&&byte| byte != b'_') {
        // `to_digit` rejects both non-alphanumeric bytes and digits outside the radix.
        let digit = char::from(byte).to_digit(radix)?;
        seen_digit = true;

        // Accumulate in 128 bits; if even that overflows the value is certainly out of range.
        let Some(next) = accumulator
            .checked_mul(wide_radix)
            .and_then(|scaled| scaled.checked_add(u128::from(digit)))
        else {
            return Some(u64::MAX);
        };

        accumulator = next;
    }

    if seen_digit {
        Some(u64::try_from(accumulator).unwrap_or(u64::MAX))
    } else {
        None
    }
}
379
/// Byte-indexed lookup table of identifier start characters.
///
/// An entry is `true` exactly when its index is `a`-`z`, `A`-`Z`, or `_`.
/// The table is built at compile time.
static IS_IDENT_START: [bool; 256] = {
    let mut flags = [false; 256];
    let mut index = 0usize;

    while index < 256 {
        flags[index] = matches!(index as u8, b'a'..=b'z' | b'A'..=b'Z' | b'_');
        index += 1;
    }

    flags
};
395
/// Byte-indexed lookup table of identifier continuation characters.
///
/// An entry is `true` exactly when its index is `a`-`z`, `A`-`Z`, `0`-`9`, `_`, or any byte
/// `>= 0x80`. The table is built at compile time.
static IS_IDENT_PART: [bool; 256] = {
    let mut flags = [false; 256];
    let mut index = 0usize;

    while index < 256 {
        flags[index] = matches!(index as u8, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF);
        index += 1;
    }

    flags
};
410
411/// Check if a byte can start an identifier (a-z, A-Z, _)
412#[inline(always)]
413#[must_use]
414pub const fn is_start_of_identifier(byte: &u8) -> bool {
415    IS_IDENT_START[*byte as usize]
416}
417
418/// Check if a byte can be part of an identifier (a-z, A-Z, 0-9, _, or >= 0x80)
419#[inline(always)]
420#[must_use]
421pub const fn is_part_of_identifier(byte: &u8) -> bool {
422    IS_IDENT_PART[*byte as usize]
423}
424
425/// Scans an identifier starting at `offset` in the byte slice and returns the length.
426/// Assumes the first byte is already validated as a start of identifier.
427/// Returns the total length of the identifier (including the first byte).
428///
429/// Stops at the first byte that is not a valid identifier character.
430#[inline(always)]
431#[must_use]
432pub fn scan_identifier_length(bytes: &[u8], offset: usize) -> usize {
433    let mut len = 1;
434    let remaining = &bytes[offset + 1..];
435
436    for &b in remaining {
437        if IS_IDENT_PART[b as usize] {
438            len += 1;
439        } else {
440            break;
441        }
442    }
443
444    len
445}
446
447/// Reads a sequence of bytes representing digits in a specific numerical base.
448///
449/// This utility function iterates through the input byte slice, consuming bytes
450/// as long as they represent valid digits for the given `base`. It handles
451/// decimal digits ('0'-'9') and hexadecimal digits ('a'-'f', 'A'-'F').
452///
453/// It stops consuming at the first byte that is not a valid digit character,
454/// or is a digit character whose value is greater than or equal to the specified `base`
455/// (e.g., '8' in base 8, or 'A' in base 10).
456///
457/// This function is primarily intended as a helper for lexer implementations
458/// when tokenizing the digit part of number literals (binary, octal, decimal, hexadecimal).
459///
460/// # Arguments
461///
462/// * `input` - A byte slice starting at the potential first digit of the number.
463/// * `base` - The numerical base (e.g., 2, 8, 10, 16) to use for validating digits.
464///   Must be between 2 and 36 (inclusive) for hex characters to be potentially valid.
465///
466/// # Returns
467///
468/// The number of bytes (`usize`) consumed from the beginning of the `input` slice
469/// that constitute a valid sequence of digits for the specified `base`. Returns 0 if
470/// the first byte is not a valid digit for the base.
471#[inline]
472pub fn read_digits_of_base(input: &Input, offset: usize, base: u8) -> usize {
473    if base == 16 {
474        read_digits_with(input, offset, u8::is_ascii_hexdigit)
475    } else {
476        let max = b'0' + base;
477
478        read_digits_with(input, offset, |b| b >= &b'0' && b < &max)
479    }
480}
481
482#[inline]
483fn read_digits_with<F: Fn(&u8) -> bool>(input: &Input, offset: usize, is_digit: F) -> usize {
484    let bytes = input.bytes;
485    let total = input.length;
486    let start = input.offset;
487    let mut pos = start + offset; // Compute the absolute position.
488
489    while pos < total {
490        let current = bytes[pos];
491        if is_digit(&current) {
492            pos += 1;
493        } else if pos + 1 < total && bytes[pos] == number_separator!() && is_digit(&bytes[pos + 1]) {
494            pos += 2; // Skip the separator and the digit.
495        } else {
496            break;
497        }
498    }
499
500    // Return the relative length from the start of the current position.
501    pos - start
502}
503
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `parse_literal_integer` against a table of inputs and expected results covering
    /// every radix prefix, underscore separators, and invalid input.
    #[test]
    fn test_parse_literal_integer() {
        let cases: [(&str, Option<u64>); 14] = [
            ("123", Some(123)),
            ("0", Some(0)),
            ("0b1010", Some(10)),
            ("0o17", Some(15)),
            ("0x1A3F", Some(6719)),
            ("0XFF", Some(255)),
            ("0_1_2_3", Some(83)),
            ("0b1_0_1_0", Some(10)),
            ("0o1_7", Some(15)),
            ("0x1_A_3_F", Some(6719)),
            ("", None),
            ("0xGHI", None),
            ("0b102", None),
            ("0o89", None),
        ];

        for &(input, expected) in &cases {
            assert_eq!(parse_literal_integer(input), expected);
        }
    }
}