Skip to main content

mago_syntax_core/
utils.rs

1use bumpalo::Bump;
2use bumpalo::collections::Vec;
3
4use crate::input::Input;
5use crate::number_separator;
6
7/// Parses a PHP literal string, handling all escape sequences, and allocates the result in an arena.
8///
9/// # Returns
10///
11/// An `Option` containing the parsed `&'arena str` or `None` if the input is invalid.
12///
13/// # Panics
14///
15/// Panics if internal assumptions about character parsing are violated (e.g., invalid hex or octal digits
16/// after validation). This should not occur with valid PHP strings.
17pub fn parse_literal_string_in<'arena>(
18    arena: &'arena Bump,
19    s: &'arena [u8],
20    quote_char: Option<u8>,
21    has_quote: bool,
22) -> Option<&'arena [u8]> {
23    if s.is_empty() {
24        return Some(b"");
25    }
26
27    let s = if has_quote
28        && (s.starts_with(b"b\"") || s.starts_with(b"b'") || s.starts_with(b"B\"") || s.starts_with(b"B'"))
29    {
30        &s[1..]
31    } else {
32        s
33    };
34
35    let (quote_char, content) = if let Some(quote_char) = quote_char {
36        (Some(quote_char), s)
37    } else if !has_quote {
38        (None, s)
39    } else if s.starts_with(b"\"") && s.ends_with(b"\"") && s.len() >= 2 {
40        (Some(b'"'), &s[1..s.len() - 1])
41    } else if s.starts_with(b"'") && s.ends_with(b"'") && s.len() >= 2 {
42        (Some(b'\''), &s[1..s.len() - 1])
43    } else {
44        return None;
45    };
46
47    let needs_processing = content.contains(&b'\\') || quote_char.is_some_and(|q| content.contains(&q));
48    if !needs_processing {
49        return Some(content);
50    }
51
52    let mut result = Vec::with_capacity_in(content.len(), arena);
53    let mut i = 0;
54
55    while i < content.len() {
56        let b = content[i];
57        if b != b'\\' {
58            result.push(b);
59            i += 1;
60            continue;
61        }
62
63        let next_index = i + 1;
64        let Some(&next) = content.get(next_index) else {
65            result.push(b'\\');
66            i += 1;
67            continue;
68        };
69
70        // Most escapes consume two bytes (`\` + the next byte). The hex and octal
71        // forms scan additional digit bytes and update `i` themselves.
72        let mut consumed = 2;
73
74        match next {
75            b'\\' => result.push(b'\\'),
76            b'\'' if quote_char == Some(b'\'') => result.push(b'\''),
77            b'"' if quote_char == Some(b'"') => result.push(b'"'),
78            b'$' if quote_char == Some(b'"') => result.push(b'$'),
79            b'n' if quote_char == Some(b'"') => result.push(b'\n'),
80            b't' if quote_char == Some(b'"') => result.push(b'\t'),
81            b'r' if quote_char == Some(b'"') => result.push(b'\r'),
82            b'v' if quote_char == Some(b'"') => result.push(0x0B),
83            b'e' if quote_char == Some(b'"') => result.push(0x1B),
84            b'f' if quote_char == Some(b'"') => result.push(0x0C),
85            b'x' if quote_char == Some(b'"') => {
86                let mut hex_val = 0u8;
87                let mut hex_len = 0;
88                let mut j = i + 2;
89                while hex_len < 2 && j < content.len() {
90                    let c = content[j];
91                    let digit = if c.is_ascii_digit() {
92                        c - b'0'
93                    } else if (b'a'..=b'f').contains(&c) {
94                        c - b'a' + 10
95                    } else if (b'A'..=b'F').contains(&c) {
96                        c - b'A' + 10
97                    } else {
98                        break;
99                    };
100                    hex_val = hex_val * 16 + digit;
101                    hex_len += 1;
102                    j += 1;
103                }
104                if hex_len > 0 {
105                    result.push(hex_val);
106                    consumed = 2 + hex_len;
107                } else {
108                    // Invalid `\x` sequence, treat as literal `\x`
109                    result.push(b'\\');
110                    result.push(b'x');
111                }
112            }
113            c if quote_char == Some(b'"') && c.is_ascii_digit() => {
114                let mut octal_val = 0u16;
115                let mut octal_len = 0;
116                let mut j = i + 1;
117                while octal_len < 3 && j < content.len() {
118                    let d = content[j];
119                    if d.is_ascii_digit() && d <= b'7' {
120                        octal_val = octal_val * 8 + u16::from(d - b'0');
121                        octal_len += 1;
122                        j += 1;
123                    } else {
124                        break;
125                    }
126                }
127                if octal_len > 0 {
128                    // Truncate to u8 (matches PHP behavior for octal sequences > 255)
129                    result.push(octal_val as u8);
130                    consumed = 1 + octal_len;
131                } else {
132                    result.push(b'\\');
133                    result.push(next);
134                }
135            }
136            _ => {
137                // Unrecognized escape sequence
138                result.push(b'\\');
139                result.push(next);
140            }
141        }
142
143        i += consumed;
144    }
145
146    Some(result.into_bump_slice())
147}
148
/// Parses a PHP float literal, ignoring `_` digit separators.
///
/// Literals without a separator are parsed straight from `value`. Otherwise the bytes are
/// copied, minus the underscores, into a 64-byte stack buffer; inputs that do not fit fall
/// back to a heap-allocated copy.
///
/// Returns `None` when the remaining bytes are not valid UTF-8 or are not accepted by
/// `f64`'s `FromStr` implementation.
#[inline]
#[must_use]
pub fn parse_literal_float(value: &[u8]) -> Option<f64> {
    const INLINE_CAPACITY: usize = 64;

    fn parse_bytes(digits: &[u8]) -> Option<f64> {
        std::str::from_utf8(digits).ok()?.parse::<f64>().ok()
    }

    if !value.contains(&b'_') {
        return parse_bytes(value);
    }

    let mut stack = [0u8; INLINE_CAPACITY];
    let mut used = 0usize;

    for byte in value.iter().copied().filter(|&byte| byte != b'_') {
        if used == INLINE_CAPACITY {
            // Too long for the stack buffer: redo the filtering into a heap buffer.
            let heap: std::vec::Vec<u8> = value.iter().copied().filter(|&byte| byte != b'_').collect();
            return parse_bytes(&heap);
        }

        stack[used] = byte;
        used += 1;
    }

    parse_bytes(&stack[..used])
}
174
/// Parses a PHP integer literal in binary, octal, decimal, or hexadecimal notation.
///
/// The radix is chosen from the prefix:
///
/// * `0x` / `0X` - hexadecimal
/// * `0o` / `0O` - octal
/// * `0b` / `0B` - binary
/// * a leading `0` followed only by octal digits and `_` - legacy octal
/// * anything else, including a leading `0` followed by `8` or `9` - decimal
///
/// `_` digit separators are skipped wherever they appear after the prefix.
///
/// # Returns
///
/// * `Some(value)` for a valid literal. Values that do not fit in a `u64` saturate to
///   `u64::MAX`.
/// * `None` when the input is empty, has no digits after the prefix, or contains a byte that
///   is not a digit of the selected radix. Every byte is validated, even once the value has
///   saturated.
#[inline]
#[must_use]
pub fn parse_literal_integer(bytes: &[u8]) -> Option<u64> {
    let (radix, digits): (u32, &[u8]) = match bytes {
        [b'0', b'x' | b'X', rest @ ..] => (16, rest),
        [b'0', b'o' | b'O', rest @ ..] => (8, rest),
        [b'0', b'b' | b'B', rest @ ..] => (2, rest),
        // Legacy octal: a leading zero followed by at least one octal digit or separator.
        [b'0', rest @ ..] if !rest.is_empty() && rest.iter().all(|&b| b == b'_' || (b'0'..=b'7').contains(&b)) => {
            (8, rest)
        }
        // Plain decimal; also covers `0` itself and zero-prefixed input containing 8 or 9.
        _ => (10, bytes),
    };

    let mut result: u64 = 0;
    let mut has_digits = false;

    for &b in digits {
        if b == b'_' {
            continue;
        }

        // `to_digit` rejects both non-digit bytes and digits too large for the radix.
        let digit = char::from(b).to_digit(radix)?;
        has_digits = true;

        // Saturating arithmetic pins the value at `u64::MAX` once it overflows while the
        // loop keeps validating the remaining bytes.
        result = result.saturating_mul(u64::from(radix)).saturating_add(u64::from(digit));
    }

    has_digits.then_some(result)
}
235
/// Byte-indexed lookup table: `true` for bytes that may begin an identifier
/// (`a-z`, `A-Z`, `_`), `false` for every other byte value.
static IS_IDENT_START: [bool; 256] = {
    let mut table = [false; 256];
    let mut index = 0usize;

    // `for` loops are not available in const initializers, hence the manual counter.
    while index < 256 {
        table[index] = matches!(index as u8, b'a'..=b'z' | b'A'..=b'Z' | b'_');
        index += 1;
    }

    table
};
251
/// Byte-indexed lookup table: `true` for bytes that may continue an identifier
/// (`a-z`, `A-Z`, `0-9`, `_`, or any byte `>= 0x80`), `false` for every other byte value.
static IS_IDENT_PART: [bool; 256] = {
    let mut table = [false; 256];
    let mut index = 0usize;

    // `for` loops are not available in const initializers, hence the manual counter.
    while index < 256 {
        table[index] = matches!(index as u8, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | 0x80..=0xFF);
        index += 1;
    }

    table
};
266
267/// Check if a byte can start an identifier (a-z, A-Z, _)
268#[inline(always)]
269#[must_use]
270pub const fn is_start_of_identifier(byte: &u8) -> bool {
271    IS_IDENT_START[*byte as usize]
272}
273
274/// Check if a byte can be part of an identifier (a-z, A-Z, 0-9, _, or >= 0x80)
275#[inline(always)]
276#[must_use]
277pub const fn is_part_of_identifier(byte: &u8) -> bool {
278    IS_IDENT_PART[*byte as usize]
279}
280
281/// Scans an identifier starting at `offset` in the byte slice and returns the length.
282///
283/// Assumes the first byte is already validated as a start of identifier.
284/// Returns the total length of the identifier (including the first byte).
285/// Stops at the first byte that is not a valid identifier character.
286#[inline(always)]
287#[must_use]
288pub fn scan_identifier_length(bytes: &[u8], offset: usize) -> usize {
289    let mut len = 1;
290    let remaining = &bytes[offset + 1..];
291
292    for &b in remaining {
293        if IS_IDENT_PART[b as usize] {
294            len += 1;
295        } else {
296            break;
297        }
298    }
299
300    len
301}
302
303/// Reads a sequence of bytes representing digits in a specific numerical base.
304///
305/// This utility function iterates through the input byte slice, consuming bytes
306/// as long as they represent valid digits for the given `base`. It handles
307/// decimal digits ('0'-'9') and hexadecimal digits ('a'-'f', 'A'-'F').
308///
309/// It stops consuming at the first byte that is not a valid digit character,
310/// or is a digit character whose value is greater than or equal to the specified `base`
311/// (e.g., '8' in base 8, or 'A' in base 10).
312///
313/// This function is primarily intended as a helper for lexer implementations
314/// when tokenizing the digit part of number literals (binary, octal, decimal, hexadecimal).
315///
316/// # Arguments
317///
318/// * `input` - A byte slice starting at the potential first digit of the number.
319/// * `base` - The numerical base (e.g., 2, 8, 10, 16) to use for validating digits.
320///   Must be between 2 and 36 (inclusive) for hex characters to be potentially valid.
321///
322/// # Returns
323///
324/// The number of bytes (`usize`) consumed from the beginning of the `input` slice
325/// that constitute a valid sequence of digits for the specified `base`. Returns 0 if
326/// the first byte is not a valid digit for the base.
327#[inline]
328pub fn read_digits_of_base(input: &Input, offset: usize, base: u8) -> usize {
329    if base == 16 {
330        read_digits_with(input, offset, u8::is_ascii_hexdigit)
331    } else {
332        let max = b'0' + base;
333
334        read_digits_with(input, offset, |b| b >= &b'0' && b < &max)
335    }
336}
337
338#[inline]
339fn read_digits_with<F>(input: &Input, offset: usize, is_digit: F) -> usize
340where
341    F: Fn(&u8) -> bool,
342{
343    let bytes = input.bytes;
344    let total = input.length;
345    let start = input.offset;
346    let mut pos = start + offset; // Compute the absolute position.
347
348    while pos < total {
349        let current = bytes[pos];
350        if is_digit(&current) {
351            pos += 1;
352        } else if pos + 1 < total && bytes[pos] == number_separator!() && is_digit(&bytes[pos + 1]) {
353            pos += 2; // Skip the separator and the digit.
354        } else {
355            break;
356        }
357    }
358
359    // Return the relative length from the start of the current position.
360    pos - start
361}
362
#[cfg(test)]
mod tests {
    use super::*;

    /// Literal inputs paired with the value `parse_literal_integer` must produce for them.
    const INTEGER_CASES: &[(&[u8], Option<u64>)] = &[
        (b"123", Some(123)),
        (b"0", Some(0)),
        (b"0b1010", Some(10)),
        (b"0o17", Some(15)),
        (b"0x1A3F", Some(6719)),
        (b"0XFF", Some(255)),
        (b"0_1_2_3", Some(83)),
        (b"0b1_0_1_0", Some(10)),
        (b"0o1_7", Some(15)),
        (b"0x1_A_3_F", Some(6719)),
        (b"", None),
        (b"0xGHI", None),
        (b"0b102", None),
        (b"0o89", None),
    ];

    #[test]
    fn test_parse_literal_integer() {
        for &(input, expected) in INTEGER_CASES {
            assert_eq!(
                parse_literal_integer(input),
                expected,
                "unexpected result for input {:?}",
                String::from_utf8_lossy(input)
            );
        }
    }
}
390}