Skip to main content

mago_type_syntax/lexer/
mod.rs

1mod keyword;
2
3use mago_database::file::FileId;
4use mago_database::file::HasFileId;
5use mago_span::Position;
6use mago_syntax_core::float_exponent;
7use mago_syntax_core::float_separator;
8use mago_syntax_core::input::Input;
9use mago_syntax_core::number_sign;
10use mago_syntax_core::part_of_identifier;
11use mago_syntax_core::start_of_binary_number;
12use mago_syntax_core::start_of_float_number;
13use mago_syntax_core::start_of_hexadecimal_number;
14use mago_syntax_core::start_of_identifier;
15use mago_syntax_core::start_of_octal_number;
16use mago_syntax_core::start_of_octal_or_float_number;
17use mago_syntax_core::utils::read_digits_of_base;
18
19use crate::error::SyntaxError;
20use crate::token::TypeToken;
21use crate::token::TypeTokenKind;
22
23#[derive(Debug)]
24pub struct TypeLexer<'arena> {
25    input: Input<'arena>,
26}
27
28impl<'arena> TypeLexer<'arena> {
29    #[inline]
30    #[must_use]
31    pub fn new(input: Input<'arena>) -> TypeLexer<'arena> {
32        TypeLexer { input }
33    }
34
35    #[inline]
36    #[must_use]
37    pub fn has_reached_eof(&self) -> bool {
38        self.input.has_reached_eof()
39    }
40
41    #[inline]
42    #[must_use]
43    pub fn current_position(&self) -> Position {
44        self.input.current_position()
45    }
46
47    #[inline]
48    #[must_use]
49    pub fn slice_in_range(&self, from: u32, to: u32) -> &'arena str {
50        let bytes_slice = self.input.slice_in_range(from, to);
51        bytes_slice.utf8_chunks().next().map_or("", |chunk| chunk.valid())
52    }
53
54    #[inline]
55    pub fn advance(&mut self) -> Option<Result<TypeToken<'arena>, SyntaxError>> {
56        if self.input.has_reached_eof() {
57            return None;
58        }
59
60        let start = self.input.current_position();
61        let whitespaces = self.input.consume_whitespaces();
62        if !whitespaces.is_empty() {
63            let end = self.input.current_position();
64            return Some(Ok(self.token(TypeTokenKind::Whitespace, whitespaces, start, end)));
65        }
66
67        let remaining = self.input.read_remaining();
68        // SAFETY: has_reached_eof() was checked at the top; remaining is non-empty.
69        let first = unsafe { *remaining.get_unchecked(0) };
70        let second = remaining.get(1).copied();
71
72        let (kind, length) = match first {
73            b'*' => (TypeTokenKind::Asterisk, 1),
74            b':' => {
75                if second == Some(b':') {
76                    (TypeTokenKind::ColonColon, 2)
77                } else {
78                    (TypeTokenKind::Colon, 1)
79                }
80            }
81            b'=' => (TypeTokenKind::Equals, 1),
82            b'?' => (TypeTokenKind::Question, 1),
83            b'!' => (TypeTokenKind::Exclamation, 1),
84            b'&' => (TypeTokenKind::Ampersand, 1),
85            b'|' => (TypeTokenKind::Pipe, 1),
86            b'>' => (TypeTokenKind::GreaterThan, 1),
87            b'<' => (TypeTokenKind::LessThan, 1),
88            b'(' => (TypeTokenKind::LeftParenthesis, 1),
89            b')' => (TypeTokenKind::RightParenthesis, 1),
90            b'[' => (TypeTokenKind::LeftBracket, 1),
91            b']' => (TypeTokenKind::RightBracket, 1),
92            b'{' => (TypeTokenKind::LeftBrace, 1),
93            b'}' => (TypeTokenKind::RightBrace, 1),
94            b',' => (TypeTokenKind::Comma, 1),
95            b'+' => (TypeTokenKind::Plus, 1),
96            b'-' => (TypeTokenKind::Minus, 1),
97            b'.' => match remaining.get(..3) {
98                Some([b'.', b'.', b'.']) => (TypeTokenKind::Ellipsis, 3),
99                _ if matches!(second, Some(b'0'..=b'9')) => self.read_decimal(),
100                _ => {
101                    return Some(Err(SyntaxError::UnrecognizedToken(
102                        self.file_id(),
103                        first,
104                        self.input.current_position(),
105                    )));
106                }
107            },
108            b'/' if second == Some(b'/') => self.read_single_line_comment(),
109            b'\'' | b'"' => self.read_literal_string(first),
110            b'\\' if second.is_some_and(|b| b.is_ascii_alphabetic() || b == b'_' || b >= 0x80) => {
111                self.read_fully_qualified_identifier()
112            }
113            b'$' if second.is_some_and(|b| b.is_ascii_alphabetic() || b == b'_' || b >= 0x80) => self.read_variable(),
114            b'0'..=b'9' => self.read_number(),
115            b if b.is_ascii_alphabetic() || b == b'_' || b >= 0x80 => self.read_identifier_or_keyword(),
116            _ => {
117                return Some(Err(SyntaxError::UnrecognizedToken(self.file_id(), first, self.input.current_position())));
118            }
119        };
120
121        let buffer = self.input.consume(length);
122        let end = self.input.current_position();
123
124        Some(Ok(self.token(kind, buffer, start, end)))
125    }
126
127    #[inline]
128    fn read_variable(&self) -> (TypeTokenKind, usize) {
129        let mut length = 2;
130        while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
131            length += 1;
132        }
133        (TypeTokenKind::Variable, length)
134    }
135
136    #[inline]
137    fn read_single_line_comment(&self) -> (TypeTokenKind, usize) {
138        let mut length = 2;
139        loop {
140            match self.input.peek(length, 1) {
141                [b'\n', ..] | [] => break,
142                [_, ..] => length += 1,
143            }
144        }
145        (TypeTokenKind::SingleLineComment, length)
146    }
147
148    #[inline]
149    fn read_decimal(&self) -> (TypeTokenKind, usize) {
150        let mut length = read_digits_of_base(&self.input, 2, 10);
151        if let float_exponent!() = self.input.peek(length, 1) {
152            length += 1;
153            if let number_sign!() = self.input.peek(length, 1) {
154                length += 1;
155            }
156            length = read_digits_of_base(&self.input, length, 10);
157        }
158        (TypeTokenKind::LiteralFloat, length)
159    }
160
161    #[inline]
162    fn read_number(&self) -> (TypeTokenKind, usize) {
163        #[derive(Debug, Clone, Copy, PartialEq, Eq)]
164        enum NumberKind {
165            Integer,
166            Float,
167            OctalOrFloat,
168            IntegerOrFloat,
169        }
170
171        let mut length = 1;
172        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
173            start_of_binary_number!() => {
174                length += 1;
175                (2, NumberKind::Integer)
176            }
177            start_of_octal_number!() => {
178                length += 1;
179                (8, NumberKind::Integer)
180            }
181            start_of_hexadecimal_number!() => {
182                length += 1;
183                (16, NumberKind::Integer)
184            }
185            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
186            start_of_float_number!() => (10, NumberKind::Float),
187            _ => (10, NumberKind::IntegerOrFloat),
188        };
189
190        if kind != NumberKind::Float {
191            length = read_digits_of_base(&self.input, length, base);
192            if kind == NumberKind::Integer {
193                return (TypeTokenKind::LiteralInteger, length);
194            }
195        }
196
197        let is_float = matches!(self.input.peek(length, 3), float_separator!());
198        if !is_float {
199            return (TypeTokenKind::LiteralInteger, length);
200        }
201
202        if let [b'.'] = self.input.peek(length, 1) {
203            length += 1;
204            length = read_digits_of_base(&self.input, length, 10);
205        }
206
207        if let float_exponent!() = self.input.peek(length, 1) {
208            let mut exp_length = length + 1;
209            if let number_sign!() = self.input.peek(exp_length, 1) {
210                exp_length += 1;
211            }
212
213            let after_exp = read_digits_of_base(&self.input, exp_length, 10);
214            if after_exp > exp_length {
215                length = after_exp;
216            }
217        }
218
219        (TypeTokenKind::LiteralFloat, length)
220    }
221
222    #[inline]
223    fn read_literal_string(&self, quote: u8) -> (TypeTokenKind, usize) {
224        let total = self.input.len();
225        let start = self.input.current_offset();
226        let mut length = 1;
227        let mut last_was_backslash = false;
228        let mut partial = false;
229
230        loop {
231            let pos = start + length;
232            if pos >= total {
233                partial = true;
234                break;
235            }
236
237            let byte = self.input.read_at(pos);
238            if *byte == b'\\' {
239                last_was_backslash = !last_was_backslash;
240                length += 1;
241            } else {
242                if byte == &quote && !last_was_backslash {
243                    length += 1;
244                    break;
245                }
246                length += 1;
247                last_was_backslash = false;
248            }
249        }
250
251        if partial { (TypeTokenKind::PartialLiteralString, length) } else { (TypeTokenKind::LiteralString, length) }
252    }
253
254    #[inline]
255    fn read_fully_qualified_identifier(&self) -> (TypeTokenKind, usize) {
256        let mut length = 2;
257        let mut last_was_slash = false;
258        loop {
259            match self.input.peek(length, 1) {
260                [start_of_identifier!(), ..] if last_was_slash => {
261                    length += 1;
262                    last_was_slash = false;
263                }
264                [part_of_identifier!(), ..] if !last_was_slash => {
265                    length += 1;
266                }
267                [b'\\', ..] => {
268                    if last_was_slash {
269                        length -= 1;
270                        break;
271                    }
272                    length += 1;
273                    last_was_slash = true;
274                }
275                _ => break,
276            }
277        }
278        (TypeTokenKind::FullyQualifiedIdentifier, length)
279    }
280
281    /// Read an identifier or keyword (including compound keywords with hyphens).
282    /// This is the hot path - optimized for common case (simple identifiers).
283    #[inline]
284    fn read_identifier_or_keyword(&self) -> (TypeTokenKind, usize) {
285        let remaining = self.input.read_remaining();
286        let total = remaining.len();
287        let mut length = 1;
288        let mut next_is_hyphen = false;
289        let mut next_is_backslash = false;
290
291        // Scan identifier bytes greedily. Break on `-` or `\\` if the next
292        // byte after them could extend the identifier (hyphen-joined keyword
293        // or namespace separator).
294        while length < total {
295            // SAFETY: length < total guarantees the index is in bounds.
296            let b = unsafe { *remaining.get_unchecked(length) };
297            if mago_syntax_core::utils::is_part_of_identifier(&b) {
298                length += 1;
299                continue;
300            }
301
302            if b == b'-' && length + 1 < total {
303                // SAFETY: `length + 1 < total` was just checked.
304                let b2 = unsafe { *remaining.get_unchecked(length + 1) };
305                if mago_syntax_core::utils::is_part_of_identifier(&b2) {
306                    next_is_hyphen = true;
307                }
308            } else if b == b'\\' && length + 1 < total {
309                // SAFETY: `length + 1 < total` was just checked.
310                let b2 = unsafe { *remaining.get_unchecked(length + 1) };
311                if mago_syntax_core::utils::is_start_of_identifier(&b2) {
312                    next_is_backslash = true;
313                }
314            } else {
315                // Any other byte ends the identifier scan; nothing to record.
316            }
317
318            break;
319        }
320
321        if next_is_backslash {
322            return self.finish_qualified_identifier(length);
323        }
324
325        if !next_is_hyphen {
326            // SAFETY: length <= total (identifier was scanned in bounds).
327            let bytes = unsafe { remaining.get_unchecked(..length) };
328            if let Some(kind) = keyword::lookup_keyword(bytes) {
329                return (kind, length);
330            }
331            return (TypeTokenKind::Identifier, length);
332        }
333
334        let base_len = length;
335        while length < total {
336            // SAFETY: `length < total` was just checked.
337            let b = unsafe { *remaining.get_unchecked(length) };
338            if mago_syntax_core::utils::is_part_of_identifier(&b) {
339                length += 1;
340                continue;
341            }
342
343            if b == b'-' && length + 1 < total {
344                // SAFETY: `length + 1 < total` was just checked.
345                let b2 = unsafe { *remaining.get_unchecked(length + 1) };
346                if mago_syntax_core::utils::is_part_of_identifier(&b2) {
347                    length += 1;
348                    continue;
349                }
350            }
351
352            break;
353        }
354
355        // SAFETY: `length` was only ever advanced while `length < total`, so it is in bounds.
356        let bytes = unsafe { remaining.get_unchecked(..length) };
357        if let Some(kind) = keyword::lookup_keyword(bytes) {
358            return (kind, length);
359        }
360
361        // SAFETY: `base_len <= length <= total`.
362        let base_bytes = unsafe { remaining.get_unchecked(..base_len) };
363        if let Some(kind) = keyword::lookup_keyword(base_bytes) {
364            return (kind, base_len);
365        }
366
367        (TypeTokenKind::Identifier, base_len)
368    }
369
370    /// Continue reading a qualified identifier (with backslashes).
371    #[inline]
372    fn finish_qualified_identifier(&self, start_len: usize) -> (TypeTokenKind, usize) {
373        let mut length = start_len;
374        let mut slashes = 0;
375        let mut last_was_slash = false;
376
377        loop {
378            match self.input.peek(length, 1) {
379                [start_of_identifier!(), ..] if last_was_slash => {
380                    length += 1;
381                    last_was_slash = false;
382                }
383                [part_of_identifier!(), ..] if !last_was_slash => {
384                    length += 1;
385                }
386                [b'\\', ..] => {
387                    if last_was_slash {
388                        length -= 1;
389                        slashes -= 1;
390                        break;
391                    }
392                    length += 1;
393                    slashes += 1;
394                    last_was_slash = true;
395                }
396                _ => break,
397            }
398        }
399
400        if last_was_slash {
401            length -= 1;
402            slashes -= 1;
403        }
404
405        if slashes > 0 { (TypeTokenKind::QualifiedIdentifier, length) } else { (TypeTokenKind::Identifier, length) }
406    }
407
408    #[inline]
409    fn token(&self, kind: TypeTokenKind, value: &'arena [u8], start: Position, _end: Position) -> TypeToken<'arena> {
410        // SAFETY: `Input` is constructed from a `&str` so the underlying bytes
411        // are valid UTF-8. Token boundaries are either ASCII stop bytes or end
412        // of input, which land on UTF-8 char boundaries.
413        let value_str = unsafe { std::str::from_utf8_unchecked(value) };
414        TypeToken { kind, start, value: value_str }
415    }
416}
417
418impl HasFileId for TypeLexer<'_> {
419    #[inline]
420    fn file_id(&self) -> FileId {
421        self.input.file_id()
422    }
423}