mago_syntax_core/
utils.rs

1use bumpalo::Bump;
2use bumpalo::collections::Vec;
3
4use crate::input::Input;
5use crate::number_separator;
6
7/// Parses a PHP literal string, handling all escape sequences, and allocates the result in an arena.
8///
9/// # Returns
10///
11/// An `Option` containing the parsed `&'arena str` or `None` if the input is invalid.
12///
13/// # Panics
14///
15/// Panics if internal assumptions about character parsing are violated (e.g., invalid hex or octal digits
16/// after validation). This should not occur with valid PHP strings.
17pub fn parse_literal_string_in<'arena>(
18    arena: &'arena Bump,
19    s: &'arena str,
20    quote_char: Option<char>,
21    has_quote: bool,
22) -> Option<&'arena str> {
23    if s.is_empty() {
24        return Some("");
25    }
26
27    let (quote_char, content) = if let Some(quote_char) = quote_char {
28        (Some(quote_char), s)
29    } else if !has_quote {
30        (None, s)
31    } else if s.starts_with('"') && s.ends_with('"') && s.len() >= 2 {
32        (Some('"'), &s[1..s.len() - 1])
33    } else if s.starts_with('\'') && s.ends_with('\'') && s.len() >= 2 {
34        (Some('\''), &s[1..s.len() - 1])
35    } else {
36        return None;
37    };
38
39    let needs_processing = content.contains('\\') || quote_char.is_some_and(|q| content.contains(q));
40    if !needs_processing {
41        return Some(content);
42    }
43
44    let mut result = Vec::with_capacity_in(content.len(), arena);
45    let mut chars = content.chars().peekable();
46    let mut buf = [0; 4];
47
48    while let Some(c) = chars.next() {
49        if c != '\\' {
50            result.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
51            continue;
52        }
53
54        let Some(&next_char) = chars.peek() else {
55            result.push(b'\\');
56            continue;
57        };
58
59        let mut consumed = true;
60
61        match next_char {
62            '\\' => result.push(b'\\'),
63            '\'' if quote_char == Some('\'') => result.push(b'\''),
64            '"' if quote_char == Some('"') => result.push(b'"'),
65            '$' if quote_char == Some('"') => result.push(b'$'),
66            'n' if quote_char == Some('"') => result.push(b'\n'),
67            't' if quote_char == Some('"') => result.push(b'\t'),
68            'r' if quote_char == Some('"') => result.push(b'\r'),
69            'v' if quote_char == Some('"') => result.push(0x0B),
70            'e' if quote_char == Some('"') => result.push(0x1B),
71            'f' if quote_char == Some('"') => result.push(0x0C),
72            '0' if quote_char == Some('"') => result.push(0x00),
73            'x' if quote_char == Some('"') => {
74                chars.next(); // Consume 'x'
75                let mut hex_val = 0u8;
76                let mut hex_len = 0;
77                // Peek up to 2 hex digits
78                while let Some(peeked) = chars.peek() {
79                    if hex_len < 2 && peeked.is_ascii_hexdigit() {
80                        hex_val = hex_val * 16 + peeked.to_digit(16).unwrap() as u8;
81                        hex_len += 1;
82                        chars.next(); // Consume the digit
83                    } else {
84                        break;
85                    }
86                }
87                if hex_len > 0 {
88                    result.push(hex_val);
89                } else {
90                    // Invalid `\x` sequence, treat as literal `\x`
91                    result.push(b'\\');
92                    result.push(b'x');
93                }
94
95                consumed = false;
96            }
97            c if quote_char == Some('"') && c.is_ascii_digit() => {
98                let mut octal_val = 0u8;
99                let mut octal_len = 0;
100
101                while let Some(peeked) = chars.peek() {
102                    if octal_len < 3 && peeked.is_ascii_digit() && *peeked <= '7' {
103                        octal_val = octal_val * 8 + peeked.to_digit(8).unwrap() as u8;
104                        octal_len += 1;
105                        chars.next(); // Consume the digit
106                    } else {
107                        break;
108                    }
109                }
110                if octal_len > 0 {
111                    result.push(octal_val);
112                } else {
113                    result.push(b'\\');
114                    result.push(b'0');
115                }
116
117                consumed = false;
118            }
119            _ => {
120                // Unrecognized escape sequence
121                if quote_char == Some('\'') {
122                    // In single quotes, only \' and \\ are special.
123                    result.push(b'\\');
124                    result.extend_from_slice(next_char.encode_utf8(&mut buf).as_bytes());
125                } else {
126                    // In double quotes, an invalid escape is just the character.
127                    result.extend_from_slice(next_char.encode_utf8(&mut buf).as_bytes());
128                }
129            }
130        }
131
132        if consumed {
133            chars.next(); // Consume the character after the backslash
134        }
135    }
136
137    std::str::from_utf8(result.into_bump_slice()).ok()
138}
139
/// Parses a PHP literal string, resolving its escape sequences, and returns the result as a `String`.
///
/// The quoting style is taken from `quote_char` when it is given. Otherwise, when `has_quote`
/// is set, the style is detected from the surrounding `"` or `'` delimiters of `s`, which are
/// stripped. When no quoting style applies, only `\\` is unescaped.
///
/// Escape handling follows PHP:
///
/// - single-quoted: only `\\` and `\'` are unescaped;
/// - double-quoted: `\\`, `\"`, `\$`, `\n`, `\t`, `\r`, `\v`, `\e`, `\f`, hexadecimal
///   `\xH` / `\xHH` and octal `\O` / `\OO` / `\OOO` are unescaped. Octal values above `\377`
///   wrap around to a single byte, as they do in PHP;
/// - every other backslash sequence (including `\x` without hex digits, `\8` and `\9`) is kept
///   verbatim, backslash included.
///
/// Byte values produced by hexadecimal and octal escapes are appended as the `char` with the
/// same code point, so values of `0x80` and above end up as two-byte UTF-8 sequences.
///
/// # Returns
///
/// An `Option<String>` containing the parsed string, or `None` when `has_quote` is set,
/// `quote_char` is `None`, and `s` is not wrapped in matching quotes.
///
/// # Notes
///
/// This function is similar to `parse_literal_string_in`, but it allocates the result on the heap instead of in an arena.
/// It is recommended to use `parse_literal_string_in` when possible for better performance in contexts where an arena is available.
#[inline]
#[must_use]
pub fn parse_literal_string(s: &str, quote_char: Option<char>, has_quote: bool) -> Option<String> {
    if s.is_empty() {
        return Some(String::new());
    }

    let (quote_char, content) = if let Some(quote_char) = quote_char {
        (Some(quote_char), s)
    } else if !has_quote {
        (None, s)
    } else if s.len() >= 2 && s.starts_with('"') && s.ends_with('"') {
        (Some('"'), &s[1..s.len() - 1])
    } else if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') {
        (Some('\''), &s[1..s.len() - 1])
    } else {
        return None;
    };

    let single_quoted = quote_char == Some('\'');
    let double_quoted = quote_char == Some('"');

    let mut result = String::with_capacity(content.len());
    let mut chars = content.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }

        // A trailing backslash has nothing to escape and is kept as-is.
        let Some(&next_char) = chars.peek() else {
            result.push('\\');
            continue;
        };

        // Escapes that map one character to exactly one character.
        let simple = match next_char {
            '\\' => Some('\\'),
            '\'' if single_quoted => Some('\''),
            '"' if double_quoted => Some('"'),
            '$' if double_quoted => Some('$'),
            'n' if double_quoted => Some('\n'),
            't' if double_quoted => Some('\t'),
            'r' if double_quoted => Some('\r'),
            'v' if double_quoted => Some('\x0B'),
            'e' if double_quoted => Some('\x1B'),
            'f' if double_quoted => Some('\x0C'),
            _ => None,
        };

        if let Some(unescaped) = simple {
            result.push(unescaped);
            chars.next(); // Consume the escaped character.
            continue;
        }

        match next_char {
            'x' if double_quoted => {
                chars.next(); // Consume 'x'.

                let mut value = 0u8;
                let mut digits = 0u8;
                while digits < 2 {
                    let Some(digit) = chars.peek().and_then(|peeked| peeked.to_digit(16)) else {
                        break;
                    };

                    // `digit` is below 16 and at most two digits are read, so this fits in a byte.
                    value = value * 16 + digit as u8;
                    digits += 1;
                    chars.next();
                }

                if digits == 0 {
                    // `\x` without hex digits is not an escape; keep it literally.
                    result.push_str("\\x");
                } else {
                    result.push(char::from(value));
                }
            }
            // `\0` is simply the shortest octal escape, so it is handled here together with
            // `\012` and friends instead of in a dedicated arm that would shadow them.
            '0'..='7' if double_quoted => {
                let mut value = 0u8;
                let mut digits = 0u8;
                while digits < 3 {
                    let Some(digit) = chars.peek().and_then(|peeked| peeked.to_digit(8)) else {
                        break;
                    };

                    // PHP silently wraps octal escapes above `\377` to a single byte.
                    value = value.wrapping_mul(8).wrapping_add(digit as u8);
                    digits += 1;
                    chars.next();
                }

                result.push(char::from(value));
            }
            _ => {
                // Not an escape sequence: PHP keeps the backslash.
                result.push('\\');

                if single_quoted {
                    // Emit the following character right away so that it can never be
                    // interpreted as the start of another sequence.
                    result.push(next_char);
                    chars.next();
                }
                // Otherwise the following character is left in the iterator; it is not a
                // backslash (handled above), so the next iteration copies it verbatim.
            }
        }
    }

    Some(result)
}
289
/// Parses a float literal, ignoring any `_` digit separators.
///
/// Every underscore is removed from `value` and the remainder is handed to `f64`'s
/// `FromStr` implementation. Returns `None` when that parse fails.
#[inline]
#[must_use]
pub fn parse_literal_float(value: &str) -> Option<f64> {
    let without_separators: String = value.chars().filter(|&ch| ch != '_').collect();

    without_separators.parse().ok()
}
297
/// Parses an integer literal into a `u64`, saturating at `u64::MAX`.
///
/// The radix is selected from the prefix: `0x` / `0X` is hexadecimal, `0o` / `0O` is octal,
/// `0b` / `0B` is binary, and a leading `0` followed by at least one more character is octal.
/// Anything else is decimal. Underscores among the digits are skipped.
///
/// Returns `None` when `value` is empty, when no digit follows the prefix, or when a character
/// is not a valid digit for the selected radix. Values that do not fit yield `Some(u64::MAX)`.
#[inline]
#[must_use]
pub fn parse_literal_integer(value: &str) -> Option<u64> {
    // The matched prefixes are pure ASCII, so slicing past them stays on a char boundary.
    let (radix, digits) = match value.as_bytes() {
        [] => return None,
        [b'0', b'x' | b'X', ..] => (16u32, &value[2..]),
        [b'0', b'o' | b'O', ..] => (8, &value[2..]),
        [b'0', b'b' | b'B', ..] => (2, &value[2..]),
        [b'0', _, ..] => (8, &value[1..]),
        _ => (10, value),
    };

    let base = u128::from(radix);
    let mut accumulator = 0u128;
    let mut seen_digit = false;

    for ch in digits.chars().filter(|&ch| ch != '_') {
        let digit = u128::from(ch.to_digit(radix)?);
        seen_digit = true;

        // Overflowing even the 128-bit accumulator saturates immediately.
        let Some(next) = accumulator.checked_mul(base).and_then(|scaled| scaled.checked_add(digit)) else {
            return Some(u64::MAX);
        };
        accumulator = next;
    }

    if !seen_digit {
        return None;
    }

    // Anything above `u64::MAX` is clamped.
    Some(u64::try_from(accumulator).unwrap_or(u64::MAX))
}
354
/// Returns `true` when `byte` is an ASCII letter or an underscore.
///
/// Unlike `is_part_of_identifier`, bytes at or above `0x80` are not accepted here.
#[inline]
#[must_use]
pub fn is_start_of_identifier(byte: &u8) -> bool {
    matches!(*byte, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
360
/// Returns `true` when `byte` is an ASCII letter, an ASCII digit, an underscore,
/// or any byte at or above `0x80`.
#[inline]
#[must_use]
pub fn is_part_of_identifier(byte: &u8) -> bool {
    matches!(*byte, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_' | 0x80..=0xFF)
}
370
371/// Reads a sequence of bytes representing digits in a specific numerical base.
372///
373/// This utility function iterates through the input byte slice, consuming bytes
374/// as long as they represent valid digits for the given `base`. It handles
375/// decimal digits ('0'-'9') and hexadecimal digits ('a'-'f', 'A'-'F').
376///
377/// It stops consuming at the first byte that is not a valid digit character,
378/// or is a digit character whose value is greater than or equal to the specified `base`
379/// (e.g., '8' in base 8, or 'A' in base 10).
380///
381/// This function is primarily intended as a helper for lexer implementations
382/// when tokenizing the digit part of number literals (binary, octal, decimal, hexadecimal).
383///
384/// # Arguments
385///
386/// * `input` - A byte slice starting at the potential first digit of the number.
387/// * `base` - The numerical base (e.g., 2, 8, 10, 16) to use for validating digits.
388///   Must be between 2 and 36 (inclusive) for hex characters to be potentially valid.
389///
390/// # Returns
391///
392/// The number of bytes (`usize`) consumed from the beginning of the `input` slice
393/// that constitute a valid sequence of digits for the specified `base`. Returns 0 if
394/// the first byte is not a valid digit for the base.
395#[inline]
396pub fn read_digits_of_base(input: &Input, offset: usize, base: u8) -> usize {
397    if base == 16 {
398        read_digits_with(input, offset, u8::is_ascii_hexdigit)
399    } else {
400        let max = b'0' + base;
401
402        read_digits_with(input, offset, |b| b >= &b'0' && b < &max)
403    }
404}
405
406#[inline]
407fn read_digits_with<F: Fn(&u8) -> bool>(input: &Input, offset: usize, is_digit: F) -> usize {
408    let bytes = input.bytes;
409    let total = input.length;
410    let start = input.offset;
411    let mut pos = start + offset; // Compute the absolute position.
412
413    while pos < total {
414        let current = bytes[pos];
415        if is_digit(&current) {
416            pos += 1;
417        } else if pos + 1 < total && bytes[pos] == number_separator!() && is_digit(&bytes[pos + 1]) {
418            pos += 2; // Skip the separator and the digit.
419        } else {
420            break;
421        }
422    }
423
424    // Return the relative length from the start of the current position.
425    pos - start
426}
427
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks prefixes, digit separators and rejected inputs for `parse_literal_integer`.
    #[test]
    fn test_parse_literal_integer() {
        let cases: &[(&str, Option<u64>)] = &[
            ("123", Some(123)),
            ("0", Some(0)),
            ("0b1010", Some(10)),
            ("0o17", Some(15)),
            ("0x1A3F", Some(6719)),
            ("0XFF", Some(255)),
            ("0_1_2_3", Some(83)),
            ("0b1_0_1_0", Some(10)),
            ("0o1_7", Some(15)),
            ("0x1_A_3_F", Some(6719)),
            ("", None),
            ("0xGHI", None),
            ("0b102", None),
            ("0o89", None),
        ];

        for &(input, expected) in cases {
            assert_eq!(parse_literal_integer(input), expected);
        }
    }
}
455}