oak_core/lexer/
scan_string.rs

1use super::LexerState;
2use crate::{
3    Language,
4    source::{SimdScanner, Source},
5};
6
/// Settings that drive [`StringConfig::scan`] when it recognises a string literal.
pub struct StringConfig {
    /// Delimiter characters (e.g., '"', '\''). A literal is only recognised when
    /// the current character is one of these, and it is closed by the same
    /// character that opened it.
    pub quotes: &'static [char],
    /// Escape character (e.g., '\\'). The character right after an escape is
    /// skipped and therefore never closes the literal. `None` turns escape
    /// handling off.
    pub escape: Option<char>,
}
14
15impl StringConfig {
16    /// Scans for a string literal at the current position in the lexer state.
17    pub fn scan<S: Source + ?Sized, L: Language>(&self, state: &mut LexerState<S, L>, kind: L::TokenType) -> bool {
18        let start = state.get_position();
19        let quote = match state.current() {
20            Some(c) if self.quotes.contains(&c) => c,
21            _ => return false,
22        };
23
24        state.advance(quote.len_utf8());
25
26        // Fast path for ASCII strings
27        if quote.is_ascii() && self.escape.map_or(true, |c| c.is_ascii()) {
28            let q_byte = quote as u8;
29            let e_byte = self.escape.map(|c| c as u8).unwrap_or(q_byte);
30
31            loop {
32                let (rest_len, found_info) = {
33                    let rest = state.rest();
34                    if rest.is_empty() {
35                        (0, None)
36                    }
37                    else {
38                        let bytes = rest.as_bytes();
39                        if let Some(pos) = find_first_of_4(bytes, q_byte, e_byte, q_byte, e_byte) { (rest.len(), Some((pos, bytes[pos]))) } else { (rest.len(), None) }
40                    }
41                };
42
43                if rest_len == 0 {
44                    break;
45                }
46
47                if let Some((pos, found)) = found_info {
48                    state.advance(pos);
49                    if found == q_byte {
50                        state.advance(1);
51                        state.add_token(kind, start, state.get_position());
52                        return true;
53                    }
54                    else {
55                        state.advance(1);
56                        if let Some(next) = state.current() {
57                            state.advance(next.len_utf8());
58                        }
59                    }
60                }
61                else {
62                    state.advance(rest_len);
63                }
64            }
65
66            // Unterminated string
67            state.add_token(kind, start, state.get_position());
68            return true;
69        }
70
71        while let Some(ch) = state.current() {
72            if Some(ch) == self.escape {
73                state.advance(ch.len_utf8());
74                if let Some(next) = state.current() {
75                    state.advance(next.len_utf8());
76                }
77            }
78            else if ch == quote {
79                state.advance(ch.len_utf8());
80                state.add_token(kind, start, state.get_position());
81                return true;
82            }
83            else {
84                state.advance(ch.len_utf8());
85            }
86        }
87
88        // Unterminated string
89        state.add_token(kind, start, state.get_position());
90        true
91    }
92}
93
94/// Finds the first occurrence of any of the four given bytes.
95#[inline]
96pub fn find_first_of_4(bytes: &[u8], a: u8, b: u8, c: u8, d: u8) -> Option<usize> {
97    SimdScanner::find_first_of_4(bytes, a, b, c, d)
98}