mago_syntax_core/
utils.rs

1use bumpalo::Bump;
2use bumpalo::collections::Vec;
3
4use crate::input::Input;
5use crate::number_separator;
6
7/// Parses a PHP literal string, handling all escape sequences, and allocates the result in an arena.
8///
9/// # Returns
10///
11/// An `Option` containing the parsed `&'arena str` or `None` if the input is invalid.
12pub fn parse_literal_string_in<'arena>(
13    arena: &'arena Bump,
14    s: &'arena str,
15    quote_char: Option<char>,
16    has_quote: bool,
17) -> Option<&'arena str> {
18    if s.is_empty() {
19        return Some("");
20    }
21
22    let (quote_char, content) = if let Some(quote_char) = quote_char {
23        (Some(quote_char), s)
24    } else if !has_quote {
25        (None, s)
26    } else if s.starts_with('"') && s.ends_with('"') && s.len() >= 2 {
27        (Some('"'), &s[1..s.len() - 1])
28    } else if s.starts_with('\'') && s.ends_with('\'') && s.len() >= 2 {
29        (Some('\''), &s[1..s.len() - 1])
30    } else {
31        return None;
32    };
33
34    let needs_processing = content.contains('\\') || quote_char.is_some_and(|q| content.contains(q));
35    if !needs_processing {
36        return Some(content);
37    }
38
39    let mut result = Vec::with_capacity_in(content.len(), arena);
40    let mut chars = content.chars().peekable();
41    let mut buf = [0; 4];
42
43    while let Some(c) = chars.next() {
44        if c != '\\' {
45            result.extend_from_slice(c.encode_utf8(&mut buf).as_bytes());
46            continue;
47        }
48
49        let Some(&next_char) = chars.peek() else {
50            result.push(b'\\');
51            continue;
52        };
53
54        let mut consumed = true;
55
56        match next_char {
57            '\\' => result.push(b'\\'),
58            '\'' if quote_char == Some('\'') => result.push(b'\''),
59            '"' if quote_char == Some('"') => result.push(b'"'),
60            '$' if quote_char == Some('"') => result.push(b'$'),
61            'n' if quote_char == Some('"') => result.push(b'\n'),
62            't' if quote_char == Some('"') => result.push(b'\t'),
63            'r' if quote_char == Some('"') => result.push(b'\r'),
64            'v' if quote_char == Some('"') => result.push(0x0B),
65            'e' if quote_char == Some('"') => result.push(0x1B),
66            'f' if quote_char == Some('"') => result.push(0x0C),
67            '0' if quote_char == Some('"') => result.push(0x00),
68            'x' if quote_char == Some('"') => {
69                chars.next(); // Consume 'x'
70                let mut hex_val = 0u8;
71                let mut hex_len = 0;
72                // Peek up to 2 hex digits
73                while let Some(peeked) = chars.peek() {
74                    if hex_len < 2 && peeked.is_ascii_hexdigit() {
75                        hex_val = hex_val * 16 + peeked.to_digit(16).unwrap() as u8;
76                        hex_len += 1;
77                        chars.next(); // Consume the digit
78                    } else {
79                        break;
80                    }
81                }
82                if hex_len > 0 {
83                    result.push(hex_val);
84                } else {
85                    // Invalid `\x` sequence, treat as literal `\x`
86                    result.push(b'\\');
87                    result.push(b'x');
88                }
89
90                consumed = false;
91            }
92            c if quote_char == Some('"') && c.is_ascii_digit() => {
93                let mut octal_val = 0u8;
94                let mut octal_len = 0;
95
96                while let Some(peeked) = chars.peek() {
97                    if octal_len < 3 && peeked.is_ascii_digit() && *peeked <= '7' {
98                        octal_val = octal_val * 8 + peeked.to_digit(8).unwrap() as u8;
99                        octal_len += 1;
100                        chars.next(); // Consume the digit
101                    } else {
102                        break;
103                    }
104                }
105                if octal_len > 0 {
106                    result.push(octal_val);
107                } else {
108                    result.push(b'\\');
109                    result.push(b'0');
110                }
111
112                consumed = false;
113            }
114            _ => {
115                // Unrecognized escape sequence
116                if quote_char == Some('\'') {
117                    // In single quotes, only \' and \\ are special.
118                    result.push(b'\\');
119                    result.extend_from_slice(next_char.encode_utf8(&mut buf).as_bytes());
120                } else {
121                    // In double quotes, an invalid escape is just the character.
122                    result.extend_from_slice(next_char.encode_utf8(&mut buf).as_bytes());
123                }
124            }
125        }
126
127        if consumed {
128            chars.next(); // Consume the character after the backslash
129        }
130    }
131
132    std::str::from_utf8(result.into_bump_slice()).ok()
133}
134
/// Parses a PHP literal string, decoding its escape sequences, and returns the result as a `String`.
///
/// # Arguments
///
/// * `s` - The literal text. When `quote_char` is `Some`, `s` is taken to be the content
///   *without* surrounding quotes.
/// * `quote_char` - The quote style of the literal, if the caller already knows it.
/// * `has_quote` - Only consulted when `quote_char` is `None`: if `true`, `s` must be wrapped in a
///   matching pair of `"` or `'`, which is detected and stripped; if `false`, `s` is processed
///   as-is with no quote style.
///
/// # Escape handling
///
/// * `\\` always collapses to a single backslash.
/// * Single-quoted: additionally only `\'` is decoded.
/// * Double-quoted: `\"`, `\$`, `\n`, `\t`, `\r`, `\v`, `\e`, `\f`, octal `\[0-7]{1,3}` (value
///   wraps to one byte, as in PHP) and hexadecimal `\x[0-9A-Fa-f]{1,2}` are decoded.
/// * Any other backslash sequence is kept verbatim, backslash included.
///
/// Bytes produced by octal and hexadecimal escapes are appended as the `char` with the same
/// code point, so values `0x80..=0xFF` become `U+0080..=U+00FF` in the returned `String`.
///
/// # Returns
///
/// An `Option<String>` containing the parsed string, or `None` when quotes were expected but `s`
/// is not wrapped in a matching pair of quotes.
///
/// # Notes
///
/// This function is similar to `parse_literal_string_in`, but it allocates the result on the heap instead of in an arena.
/// It is recommended to use `parse_literal_string_in` when possible for better performance in contexts where an arena is available.
#[inline]
pub fn parse_literal_string(s: &str, quote_char: Option<char>, has_quote: bool) -> Option<String> {
    if s.is_empty() {
        return Some(String::new());
    }

    let (quote_char, content) = if let Some(quote_char) = quote_char {
        (Some(quote_char), s)
    } else if !has_quote {
        (None, s)
    } else if s.len() >= 2 && s.starts_with('"') && s.ends_with('"') {
        (Some('"'), &s[1..s.len() - 1])
    } else if s.len() >= 2 && s.starts_with('\'') && s.ends_with('\'') {
        (Some('\''), &s[1..s.len() - 1])
    } else {
        return None;
    };

    let is_double = quote_char == Some('"');
    let is_single = quote_char == Some('\'');

    let mut result = String::with_capacity(content.len());
    let mut chars = content.chars().peekable();

    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }

        // A trailing backslash has nothing to escape and is kept literally.
        let Some(&next_char) = chars.peek() else {
            result.push('\\');
            continue;
        };

        // Escapes that map one character to one character.
        let simple = match next_char {
            '\\' => Some('\\'),
            '\'' if is_single => Some('\''),
            '"' if is_double => Some('"'),
            '$' if is_double => Some('$'),
            'n' if is_double => Some('\n'),
            't' if is_double => Some('\t'),
            'r' if is_double => Some('\r'),
            'v' if is_double => Some('\x0B'),
            'e' if is_double => Some('\x1B'),
            'f' if is_double => Some('\x0C'),
            _ => None,
        };

        if let Some(replacement) = simple {
            result.push(replacement);
            chars.next(); // Consume the escaped character.
            continue;
        }

        if is_double && next_char == 'x' {
            chars.next(); // Consume 'x'.

            let mut value: u32 = 0;
            let mut digits = 0;
            while digits < 2 {
                let Some(digit) = chars.peek().and_then(|p| p.to_digit(16)) else {
                    break;
                };

                value = value * 16 + digit;
                digits += 1;
                chars.next();
            }

            if digits == 0 {
                // `\x` without hex digits is not an escape; keep it verbatim.
                result.push_str("\\x");
            } else {
                // At most two hex digits, so the value always fits in a byte.
                result.push(char::from((value & 0xFF) as u8));
            }

            continue;
        }

        if is_double && ('0'..='7').contains(&next_char) {
            // Up to three octal digits; this also covers `\0` and sequences such as `\012`.
            let mut value: u32 = 0;
            let mut digits = 0;
            while digits < 3 {
                let Some(digit) = chars.peek().and_then(|p| p.to_digit(8)) else {
                    break;
                };

                value = value * 8 + digit;
                digits += 1;
                chars.next();
            }

            // PHP silently wraps values above 0o377 to a single byte.
            result.push(char::from((value & 0xFF) as u8));
            continue;
        }

        // Unrecognized escape: keep the backslash. The following character is left in the
        // iterator and copied verbatim by the next loop iteration.
        result.push('\\');
    }

    Some(result)
}
279
/// Parses a float literal, ignoring `_` digit separators.
///
/// Every underscore is removed from `value` and the remainder is handed to
/// `f64`'s `FromStr` implementation. Returns `None` when that parse fails.
#[inline]
pub fn parse_literal_float(value: &str) -> Option<f64> {
    let without_separators: String = value.chars().filter(|&ch| ch != '_').collect();

    without_separators.parse().ok()
}
286
/// Parses a PHP integer literal into a `u64`, saturating at `u64::MAX`.
///
/// Supported notations:
///
/// * `0x` / `0X` prefix: hexadecimal.
/// * `0o` / `0O` prefix: octal.
/// * `0b` / `0B` prefix: binary.
/// * A leading `0` followed by more characters (e.g. `0755`): legacy PHP octal.
/// * Anything else: decimal.
///
/// Underscore digit separators are skipped wherever they appear.
///
/// # Returns
///
/// `None` if `value` is empty, contains no digits, or contains a character that is not a valid
/// digit for the detected radix (for example `08`, which is not valid octal). Otherwise the
/// parsed value, clamped to `u64::MAX` when it does not fit.
#[inline]
pub fn parse_literal_integer(value: &str) -> Option<u64> {
    if value.is_empty() {
        return None;
    }

    let (radix, digits): (u32, &str) =
        if let Some(rest) = value.strip_prefix("0x").or_else(|| value.strip_prefix("0X")) {
            (16, rest)
        } else if let Some(rest) = value.strip_prefix("0o").or_else(|| value.strip_prefix("0O")) {
            (8, rest)
        } else if let Some(rest) = value.strip_prefix("0b").or_else(|| value.strip_prefix("0B")) {
            (2, rest)
        } else if value.len() > 1 && value.starts_with('0') {
            // Legacy octal: the leading zero is kept since it does not affect the value.
            (8, value)
        } else {
            (10, value)
        };

    let mut result: u128 = 0;
    let mut has_digits = false;

    for c in digits.chars().filter(|&c| c != '_') {
        let digit = u128::from(c.to_digit(radix)?);
        has_digits = true;

        // Accumulate in `u128`; if even that overflows, the value is far beyond `u64::MAX`.
        result = match result.checked_mul(u128::from(radix)).and_then(|r| r.checked_add(digit)) {
            Some(next) => next,
            None => return Some(u64::MAX),
        };
    }

    if !has_digits {
        return None;
    }

    // Clamp the result to u64::MAX if it's too large.
    Some(u64::try_from(result).unwrap_or(u64::MAX))
}
339
/// Returns `true` when `byte` is an ASCII letter (`a`-`z`, `A`-`Z`) or an underscore.
///
/// NOTE(review): unlike `is_part_of_identifier`, bytes `>= 0x80` are rejected here — confirm
/// that callers handle non-ASCII identifier starts separately.
#[inline]
pub fn is_start_of_identifier(byte: &u8) -> bool {
    matches!(*byte, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
344
/// Returns `true` when `byte` may appear inside an identifier: an ASCII letter or digit, an
/// underscore, or any non-ASCII byte (`0x80..=0xFF`).
#[inline]
pub fn is_part_of_identifier(byte: &u8) -> bool {
    match *byte {
        b'_' => true,
        other => other.is_ascii_alphanumeric() || !other.is_ascii(),
    }
}
353
354/// Reads a sequence of bytes representing digits in a specific numerical base.
355///
356/// This utility function iterates through the input byte slice, consuming bytes
357/// as long as they represent valid digits for the given `base`. It handles
358/// decimal digits ('0'-'9') and hexadecimal digits ('a'-'f', 'A'-'F').
359///
360/// It stops consuming at the first byte that is not a valid digit character,
361/// or is a digit character whose value is greater than or equal to the specified `base`
362/// (e.g., '8' in base 8, or 'A' in base 10).
363///
364/// This function is primarily intended as a helper for lexer implementations
365/// when tokenizing the digit part of number literals (binary, octal, decimal, hexadecimal).
366///
367/// # Arguments
368///
369/// * `input` - A byte slice starting at the potential first digit of the number.
370/// * `base` - The numerical base (e.g., 2, 8, 10, 16) to use for validating digits.
371///   Must be between 2 and 36 (inclusive) for hex characters to be potentially valid.
372///
373/// # Returns
374///
375/// The number of bytes (`usize`) consumed from the beginning of the `input` slice
376/// that constitute a valid sequence of digits for the specified `base`. Returns 0 if
377/// the first byte is not a valid digit for the base.
378#[inline]
379pub fn read_digits_of_base(input: &Input, offset: usize, base: u8) -> usize {
380    if base == 16 {
381        read_digits_with(input, offset, u8::is_ascii_hexdigit)
382    } else {
383        let max = b'0' + base;
384
385        read_digits_with(input, offset, |b| b >= &b'0' && b < &max)
386    }
387}
388
389#[inline]
390fn read_digits_with<F: Fn(&u8) -> bool>(input: &Input, offset: usize, is_digit: F) -> usize {
391    let bytes = input.bytes;
392    let total = input.length;
393    let start = input.offset;
394    let mut pos = start + offset; // Compute the absolute position.
395
396    while pos < total {
397        let current = bytes[pos];
398        if is_digit(&current) {
399            pos += 1;
400        } else if pos + 1 < total && bytes[pos] == number_separator!() && is_digit(&bytes[pos + 1]) {
401            pos += 2; // Skip the separator and the digit.
402        } else {
403            break;
404        }
405    }
406
407    // Return the relative length from the start of the current position.
408    pos - start
409}