Skip to main content

mago_syntax_core/
utils.rs

1use bumpalo::Bump;
2use bumpalo::collections::Vec;
3
4use crate::input::Input;
5use crate::number_separator;
6
7/// Parses a PHP literal string, handling all escape sequences, and allocates the result in an arena.
8///
9/// # Returns
10///
11/// An `Option` containing the parsed `&'arena str` or `None` if the input is invalid.
12///
13/// # Panics
14///
15/// Panics if internal assumptions about character parsing are violated (e.g., invalid hex or octal digits
16/// after validation). This should not occur with valid PHP strings.
17pub fn parse_literal_string_in<'arena>(
18    arena: &'arena Bump,
19    s: &'arena str,
20    quote_char: Option<char>,
21    has_quote: bool,
22) -> Option<&'arena str> {
23    if s.is_empty() {
24        return Some("");
25    }
26
27    let s = if has_quote && (s.starts_with("b\"") || s.starts_with("b'") || s.starts_with("B\"") || s.starts_with("B'"))
28    {
29        &s[1..]
30    } else {
31        s
32    };
33
34    let (quote_char, content) = if let Some(quote_char) = quote_char {
35        (Some(quote_char), s)
36    } else if !has_quote {
37        (None, s)
38    } else if s.starts_with('"') && s.ends_with('"') && s.len() >= 2 {
39        (Some('"'), &s[1..s.len() - 1])
40    } else if s.starts_with('\'') && s.ends_with('\'') && s.len() >= 2 {
41        (Some('\''), &s[1..s.len() - 1])
42    } else {
43        return None;
44    };
45
46    let needs_processing = content.contains('\\') || quote_char.is_some_and(|q| content.contains(q));
47    if !needs_processing {
48        return Some(content);
49    }
50
51    let mut result = Vec::with_capacity_in(content.len(), arena);
52    let mut chars = content.chars().peekable();
53    let mut buf = [0; 4];
54
55    while let Some(c) = chars.next() {
56        if c != '\\' {
57            result.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
58            continue;
59        }
60
61        let Some(&next_char) = chars.peek() else {
62            result.push(b'\\');
63            continue;
64        };
65
66        let mut consumed = true;
67
68        match next_char {
69            '\\' => result.push(b'\\'),
70            '\'' if quote_char == Some('\'') => result.push(b'\''),
71            '"' if quote_char == Some('"') => result.push(b'"'),
72            '$' if quote_char == Some('"') => result.push(b'$'),
73            'n' if quote_char == Some('"') => result.push(b'\n'),
74            't' if quote_char == Some('"') => result.push(b'\t'),
75            'r' if quote_char == Some('"') => result.push(b'\r'),
76            'v' if quote_char == Some('"') => result.push(0x0B),
77            'e' if quote_char == Some('"') => result.push(0x1B),
78            'f' if quote_char == Some('"') => result.push(0x0C),
79            'x' if quote_char == Some('"') => {
80                chars.next(); // Consume 'x'
81                let mut hex_val = 0u8;
82                let mut hex_len = 0;
83                // Peek up to 2 hex digits
84                while let Some(peeked) = chars.peek() {
85                    if hex_len < 2
86                        && peeked.is_ascii_hexdigit()
87                        && let Some(digit) = peeked.to_digit(16)
88                    {
89                        hex_val = hex_val * 16 + digit as u8;
90                        hex_len += 1;
91                        chars.next(); // Consume the digit
92                    } else {
93                        break;
94                    }
95                }
96                if hex_len > 0 {
97                    result.push(hex_val);
98                } else {
99                    // Invalid `\x` sequence, treat as literal `\x`
100                    result.push(b'\\');
101                    result.push(b'x');
102                }
103
104                consumed = false;
105            }
106            c if quote_char == Some('"') && c.is_ascii_digit() => {
107                let mut octal_val = 0u16;
108                let mut octal_len = 0;
109
110                while let Some(peeked) = chars.peek() {
111                    if octal_len < 3
112                        && peeked.is_ascii_digit()
113                        && *peeked <= '7'
114                        && let Some(digit) = peeked.to_digit(8)
115                    {
116                        octal_val = octal_val * 8 + digit as u16;
117                        octal_len += 1;
118                        chars.next(); // Consume the digit
119                    } else {
120                        break;
121                    }
122                }
123                if octal_len > 0 {
124                    // Truncate to u8 (matches PHP behavior for octal sequences > 255)
125                    result.push(octal_val as u8);
126                } else {
127                    result.push(b'\\');
128                    result.extend_from_slice(next_char.encode_utf8(&mut buf).as_bytes());
129                    chars.next();
130                }
131
132                consumed = false;
133            }
134            _ => {
135                // Unrecognized escape sequence
136                result.push(b'\\');
137                result.extend_from_slice(next_char.encode_utf8(&mut buf).as_bytes());
138            }
139        }
140
141        if consumed {
142            chars.next(); // Consume the character after the backslash
143        }
144    }
145
146    std::str::from_utf8(result.into_bump_slice()).ok()
147}
148
/// Parses a PHP literal string, handling all escape sequences, and returns the result as a `String`.
///
/// When `has_quote` is set, a binary-string prefix (`b` / `B`) directly in front of the opening
/// quote is dropped first. The delimiter is then resolved as follows:
///
/// * `quote_char` is `Some(..)`: all of `s` is treated as content delimited by that quote.
/// * `has_quote` is `false`: all of `s` is treated as content without a delimiter, so only `\\`
///   is unescaped.
/// * otherwise `s` must be wrapped in a matching pair of `"` or `'`, which is stripped.
///
/// Inside single quotes only `\\` and `\'` are unescaped. Inside double quotes `\\`, `\"`, `\$`,
/// `\n`, `\t`, `\r`, `\v`, `\e`, `\f`, `\x` followed by one or two hex digits, and one to three
/// octal digits are decoded. Octal values above `0o377` wrap around to a single byte, as in PHP.
/// Any other escape sequence (including `\x` without hex digits) is kept verbatim.
///
/// # Returns
///
/// `Some` with the decoded string, or `None` if `has_quote` is set without a `quote_char` and
/// `s` is not wrapped in matching quotes.
///
/// # Notes
///
/// This function is similar to `parse_literal_string_in`, but it allocates the result on the heap instead of in an arena.
/// It is recommended to use `parse_literal_string_in` when possible for better performance in contexts where an arena is available.
///
/// Bytes produced by `\x` and octal escapes are pushed as the `char` with the same code point,
/// so values of `0x80` and above become two-byte UTF-8 sequences in the returned `String`.
#[inline]
#[must_use]
pub fn parse_literal_string(s: &str, quote_char: Option<char>, has_quote: bool) -> Option<String> {
    if s.is_empty() {
        return Some(String::new());
    }

    // Drop the binary-string prefix so the quote detection below sees the quote first.
    // The prefix is a single ASCII byte, so slicing at 1 is always on a char boundary.
    let s = if has_quote && (s.starts_with("b\"") || s.starts_with("b'") || s.starts_with("B\"") || s.starts_with("B'"))
    {
        &s[1..]
    } else {
        s
    };

    let (quote_char, content) = if let Some(quote_char) = quote_char {
        (Some(quote_char), s)
    } else if !has_quote {
        (None, s)
    } else if s.starts_with('"') && s.ends_with('"') && s.len() >= 2 {
        (Some('"'), &s[1..s.len() - 1])
    } else if s.starts_with('\'') && s.ends_with('\'') && s.len() >= 2 {
        (Some('\''), &s[1..s.len() - 1])
    } else {
        return None;
    };

    let double_quoted = quote_char == Some('"');
    let single_quoted = quote_char == Some('\'');

    let mut result = String::with_capacity(content.len());
    let mut chars = content.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }

        // A trailing backslash has nothing to escape and is kept as-is.
        let Some(&next_char) = chars.peek() else {
            result.push('\\');
            continue;
        };

        // Escapes that map a single character to a single character.
        let simple = match next_char {
            '\\' => Some('\\'),
            '\'' if single_quoted => Some('\''),
            '"' | '$' if double_quoted => Some(next_char),
            'n' if double_quoted => Some('\n'),
            't' if double_quoted => Some('\t'),
            'r' if double_quoted => Some('\r'),
            'v' if double_quoted => Some('\x0B'),
            'e' if double_quoted => Some('\x1B'),
            'f' if double_quoted => Some('\x0C'),
            _ => None,
        };

        if let Some(unescaped) = simple {
            result.push(unescaped);
            chars.next(); // Consume the character after the backslash.
            continue;
        }

        if double_quoted && next_char == 'x' {
            chars.next(); // Consume 'x'.

            // At most two hex digits, so the value always fits in a `u8`.
            let mut hex_val = 0u8;
            let mut hex_len = 0;
            while hex_len < 2 {
                let Some(digit) = chars.peek().and_then(|peeked| peeked.to_digit(16)) else {
                    break;
                };

                hex_val = hex_val * 16 + digit as u8;
                hex_len += 1;
                chars.next();
            }

            if hex_len > 0 {
                result.push(hex_val as char);
            } else {
                // `\x` without hex digits is not an escape; keep it literally.
                result.push_str("\\x");
            }

            continue;
        }

        if double_quoted && next_char.is_ascii_digit() {
            // At most three octal digits (max 0o777), which fits in a `u16`.
            let mut octal_val = 0u16;
            let mut octal_len = 0;
            while octal_len < 3 {
                let Some(digit) = chars.peek().and_then(|peeked| peeked.to_digit(8)) else {
                    break;
                };

                octal_val = octal_val * 8 + digit as u16;
                octal_len += 1;
                chars.next();
            }

            if octal_len > 0 {
                // Wrap to a single byte (matches PHP behavior for octal sequences > 0o377).
                result.push((octal_val as u8) as char);
            } else {
                // The digit was `8` or `9`: not an octal escape, keep `\8` / `\9` literally.
                result.push('\\');
                result.push(next_char);
                chars.next();
            }

            continue;
        }

        // Unrecognized escape sequence: keep the backslash and the character.
        result.push('\\');
        result.push(next_char);
        chars.next();
    }

    Some(result)
}
302
303/// Parses a PHP literal float, handling underscore separators.
304#[inline]
305#[must_use]
306pub fn parse_literal_float(value: &str) -> Option<f64> {
307    if memchr::memchr(b'_', value.as_bytes()).is_none() {
308        return value.parse::<f64>().ok();
309    }
310
311    let mut buf = [0u8; 64];
312    let mut len = 0;
313
314    for &b in value.as_bytes() {
315        if b != b'_' {
316            if len < 64 {
317                buf[len] = b;
318                len += 1;
319            } else {
320                let source = value.replace('_', "");
321                return source.parse::<f64>().ok();
322            }
323        }
324    }
325
326    // SAFETY: We only copied ASCII bytes from a valid UTF-8 string
327    let s = unsafe { std::str::from_utf8_unchecked(&buf[..len]) };
328    s.parse::<f64>().ok()
329}
330
/// Parses a PHP literal integer with support for binary, octal, decimal, and hex.
///
/// Recognized forms:
///
/// * `0x` / `0X` prefix: hexadecimal.
/// * `0o` / `0O` prefix: octal.
/// * `0b` / `0B` prefix: binary.
/// * a leading `0` followed only by octal digits and `_`: legacy octal.
/// * anything else (including a leading `0` followed by `8` or `9`): decimal.
///
/// `_` separators are skipped wherever they appear after the prefix.
///
/// # Returns
///
/// `None` if the input is empty, contains no digits after the prefix, or contains a byte that
/// is not a digit of the detected radix. Values that do not fit in a `u64` saturate to
/// `u64::MAX`; the remaining bytes are still validated in that case.
///
/// Works on bytes rather than Unicode chars.
#[inline]
#[must_use]
pub fn parse_literal_integer(value: &str) -> Option<u64> {
    let bytes = value.as_bytes();

    let (radix, digits): (u32, &[u8]) = match bytes {
        [] => return None,
        [b'0', b'x' | b'X', rest @ ..] => (16, rest),
        [b'0', b'o' | b'O', rest @ ..] => (8, rest),
        [b'0', b'b' | b'B', rest @ ..] => (2, rest),
        // Legacy octal: a leading zero followed by nothing but octal digits and separators.
        [b'0', rest @ ..] if !rest.is_empty() && rest.iter().all(|&b| b == b'_' || (b'0'..=b'7').contains(&b)) => {
            (8, rest)
        }
        // Plain decimal, which also covers a lone `0` and a leading zero followed by `8`/`9`.
        _ => (10, bytes),
    };

    let mut result: u64 = 0;
    let mut has_digits = false;

    for &b in digits {
        if b == b'_' {
            continue;
        }

        // `to_digit` only accepts ASCII digits/letters below `radix`; bytes >= 0x80 map to
        // non-ASCII chars and are rejected as well.
        let digit = char::from(b).to_digit(radix)?;
        has_digits = true;

        // Saturate instead of returning early so that trailing garbage is still rejected.
        result = result.saturating_mul(u64::from(radix)).saturating_add(u64::from(digit));
    }

    if has_digits { Some(result) } else { None }
}
392
/// Byte-indexed lookup table: `true` for bytes that may start an identifier
/// (`a-z`, `A-Z`, `_`), `false` for every other byte value.
static IS_IDENT_START: [bool; 256] = {
    let mut table = [false; 256];
    let mut index = 0usize;
    while index < 256 {
        table[index] = matches!(index as u8, b'a'..=b'z' | b'A'..=b'Z' | b'_');
        index += 1;
    }

    table
};
408
/// Byte-indexed lookup table: `true` for bytes that may continue an identifier
/// (`a-z`, `A-Z`, `0-9`, `_`, or any byte `>= 0x80`), `false` for every other byte value.
static IS_IDENT_PART: [bool; 256] = {
    let mut table = [false; 256];
    let mut index = 0usize;
    while index < 256 {
        table[index] = matches!(index as u8, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF);
        index += 1;
    }

    table
};
423
424/// Check if a byte can start an identifier (a-z, A-Z, _)
425#[inline(always)]
426#[must_use]
427pub const fn is_start_of_identifier(byte: &u8) -> bool {
428    IS_IDENT_START[*byte as usize]
429}
430
431/// Check if a byte can be part of an identifier (a-z, A-Z, 0-9, _, or >= 0x80)
432#[inline(always)]
433#[must_use]
434pub const fn is_part_of_identifier(byte: &u8) -> bool {
435    IS_IDENT_PART[*byte as usize]
436}
437
438/// Scans an identifier starting at `offset` in the byte slice and returns the length.
439///
440/// Assumes the first byte is already validated as a start of identifier.
441/// Returns the total length of the identifier (including the first byte).
442/// Stops at the first byte that is not a valid identifier character.
443#[inline(always)]
444#[must_use]
445pub fn scan_identifier_length(bytes: &[u8], offset: usize) -> usize {
446    let mut len = 1;
447    let remaining = &bytes[offset + 1..];
448
449    for &b in remaining {
450        if IS_IDENT_PART[b as usize] {
451            len += 1;
452        } else {
453            break;
454        }
455    }
456
457    len
458}
459
460/// Reads a sequence of bytes representing digits in a specific numerical base.
461///
462/// This utility function iterates through the input byte slice, consuming bytes
463/// as long as they represent valid digits for the given `base`. It handles
464/// decimal digits ('0'-'9') and hexadecimal digits ('a'-'f', 'A'-'F').
465///
466/// It stops consuming at the first byte that is not a valid digit character,
467/// or is a digit character whose value is greater than or equal to the specified `base`
468/// (e.g., '8' in base 8, or 'A' in base 10).
469///
470/// This function is primarily intended as a helper for lexer implementations
471/// when tokenizing the digit part of number literals (binary, octal, decimal, hexadecimal).
472///
473/// # Arguments
474///
475/// * `input` - A byte slice starting at the potential first digit of the number.
476/// * `base` - The numerical base (e.g., 2, 8, 10, 16) to use for validating digits.
477///   Must be between 2 and 36 (inclusive) for hex characters to be potentially valid.
478///
479/// # Returns
480///
481/// The number of bytes (`usize`) consumed from the beginning of the `input` slice
482/// that constitute a valid sequence of digits for the specified `base`. Returns 0 if
483/// the first byte is not a valid digit for the base.
484#[inline]
485pub fn read_digits_of_base(input: &Input, offset: usize, base: u8) -> usize {
486    if base == 16 {
487        read_digits_with(input, offset, u8::is_ascii_hexdigit)
488    } else {
489        let max = b'0' + base;
490
491        read_digits_with(input, offset, |b| b >= &b'0' && b < &max)
492    }
493}
494
495#[inline]
496fn read_digits_with<F: Fn(&u8) -> bool>(input: &Input, offset: usize, is_digit: F) -> usize {
497    let bytes = input.bytes;
498    let total = input.length;
499    let start = input.offset;
500    let mut pos = start + offset; // Compute the absolute position.
501
502    while pos < total {
503        let current = bytes[pos];
504        if is_digit(&current) {
505            pos += 1;
506        } else if pos + 1 < total && bytes[pos] == number_separator!() && is_digit(&bytes[pos + 1]) {
507            pos += 2; // Skip the separator and the digit.
508        } else {
509            break;
510        }
511    }
512
513    // Return the relative length from the start of the current position.
514    pos - start
515}
516
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `parse_literal_integer(input)` yields `expected`, reporting failures at the
    /// call site.
    #[track_caller]
    fn assert_parses(input: &str, expected: Option<u64>) {
        assert_eq!(parse_literal_integer(input), expected);
    }

    #[test]
    fn test_parse_literal_integer() {
        // Plain literals in every supported base.
        assert_parses("123", Some(123));
        assert_parses("0", Some(0));
        assert_parses("0b1010", Some(10));
        assert_parses("0o17", Some(15));
        assert_parses("0x1A3F", Some(6719));
        assert_parses("0XFF", Some(255));

        // The same bases with `_` separators.
        assert_parses("0_1_2_3", Some(83));
        assert_parses("0b1_0_1_0", Some(10));
        assert_parses("0o1_7", Some(15));
        assert_parses("0x1_A_3_F", Some(6719));

        // Empty input and digits that are invalid for the detected base.
        assert_parses("", None);
        assert_parses("0xGHI", None);
        assert_parses("0b102", None);
        assert_parses("0o89", None);
    }
}