// mago_type_syntax/lexer/mod.rs

1mod keyword;
2
3use mago_database::file::FileId;
4use mago_database::file::HasFileId;
5use mago_span::Position;
6use mago_syntax_core::float_exponent;
7use mago_syntax_core::float_separator;
8use mago_syntax_core::input::Input;
9use mago_syntax_core::number_sign;
10use mago_syntax_core::part_of_identifier;
11use mago_syntax_core::start_of_binary_number;
12use mago_syntax_core::start_of_float_number;
13use mago_syntax_core::start_of_hexadecimal_number;
14use mago_syntax_core::start_of_identifier;
15use mago_syntax_core::start_of_number;
16use mago_syntax_core::start_of_octal_number;
17use mago_syntax_core::start_of_octal_or_float_number;
18use mago_syntax_core::utils::read_digits_of_base;
19
20use crate::error::SyntaxError;
21use crate::token::TypeToken;
22use crate::token::TypeTokenKind;
23
24#[derive(Debug)]
25pub struct TypeLexer<'input> {
26    input: Input<'input>,
27}
28
29impl<'input> TypeLexer<'input> {
30    #[inline]
31    #[must_use]
32    pub fn new(input: Input<'input>) -> TypeLexer<'input> {
33        TypeLexer { input }
34    }
35
36    #[inline]
37    #[must_use]
38    pub fn has_reached_eof(&self) -> bool {
39        self.input.has_reached_eof()
40    }
41
42    #[inline]
43    #[must_use]
44    pub fn current_position(&self) -> Position {
45        self.input.current_position()
46    }
47
48    #[inline]
49    #[must_use]
50    pub fn slice_in_range(&self, from: u32, to: u32) -> &'input str {
51        let bytes_slice = self.input.slice_in_range(from, to);
52        bytes_slice.utf8_chunks().next().map_or("", |chunk| chunk.valid())
53    }
54
55    #[inline]
56    pub fn advance(&mut self) -> Option<Result<TypeToken<'input>, SyntaxError>> {
57        if self.input.has_reached_eof() {
58            return None;
59        }
60
61        let start = self.input.current_position();
62        let whitespaces = self.input.consume_whitespaces();
63        if !whitespaces.is_empty() {
64            let end = self.input.current_position();
65            return Some(Ok(self.token(TypeTokenKind::Whitespace, whitespaces, start, end)));
66        }
67
68        let (kind, length) = match self.input.read(3) {
69            [b'*', ..] => (TypeTokenKind::Asterisk, 1),
70            [b'.', b'.', b'.'] => (TypeTokenKind::Ellipsis, 3),
71            [b':', b':', ..] => (TypeTokenKind::ColonColon, 2),
72            [b'/', b'/', ..] => self.read_single_line_comment(),
73            [b'.', start_of_number!(), ..] => self.read_decimal(),
74            [start_of_number!(), ..] => self.read_number(),
75            [quote @ (b'\'' | b'"'), ..] => self.read_literal_string(*quote),
76            [b'\\', start_of_identifier!(), ..] => self.read_fully_qualified_identifier(),
77            [start_of_identifier!(), ..] => self.read_identifier_or_keyword(),
78            [b'$', start_of_identifier!(), ..] => self.read_variable(),
79            [b':', ..] => (TypeTokenKind::Colon, 1),
80            [b'=', ..] => (TypeTokenKind::Equals, 1),
81            [b'?', ..] => (TypeTokenKind::Question, 1),
82            [b'!', ..] => (TypeTokenKind::Exclamation, 1),
83            [b'&', ..] => (TypeTokenKind::Ampersand, 1),
84            [b'|', ..] => (TypeTokenKind::Pipe, 1),
85            [b'>', ..] => (TypeTokenKind::GreaterThan, 1),
86            [b'<', ..] => (TypeTokenKind::LessThan, 1),
87            [b'(', ..] => (TypeTokenKind::LeftParenthesis, 1),
88            [b')', ..] => (TypeTokenKind::RightParenthesis, 1),
89            [b'[', ..] => (TypeTokenKind::LeftBracket, 1),
90            [b']', ..] => (TypeTokenKind::RightBracket, 1),
91            [b'{', ..] => (TypeTokenKind::LeftBrace, 1),
92            [b'}', ..] => (TypeTokenKind::RightBrace, 1),
93            [b',', ..] => (TypeTokenKind::Comma, 1),
94            [b'+', ..] => (TypeTokenKind::Plus, 1),
95            [b'-', ..] => (TypeTokenKind::Minus, 1),
96            [unknown_byte, ..] => {
97                return Some(Err(SyntaxError::UnrecognizedToken(
98                    self.file_id(),
99                    *unknown_byte,
100                    self.input.current_position(),
101                )));
102            }
103            [] => unreachable!(),
104        };
105
106        let buffer = self.input.consume(length);
107        let end = self.input.current_position();
108
109        Some(Ok(self.token(kind, buffer, start, end)))
110    }
111
112    #[inline]
113    fn read_variable(&self) -> (TypeTokenKind, usize) {
114        let mut length = 2;
115        while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
116            length += 1;
117        }
118        (TypeTokenKind::Variable, length)
119    }
120
121    #[inline]
122    fn read_single_line_comment(&self) -> (TypeTokenKind, usize) {
123        let mut length = 2;
124        loop {
125            match self.input.peek(length, 1) {
126                [b'\n', ..] | [] => break,
127                [_, ..] => length += 1,
128            }
129        }
130        (TypeTokenKind::SingleLineComment, length)
131    }
132
133    #[inline]
134    fn read_decimal(&self) -> (TypeTokenKind, usize) {
135        let mut length = read_digits_of_base(&self.input, 2, 10);
136        if let float_exponent!() = self.input.peek(length, 1) {
137            length += 1;
138            if let number_sign!() = self.input.peek(length, 1) {
139                length += 1;
140            }
141            length = read_digits_of_base(&self.input, length, 10);
142        }
143        (TypeTokenKind::LiteralFloat, length)
144    }
145
146    #[inline]
147    fn read_number(&self) -> (TypeTokenKind, usize) {
148        #[derive(Debug, Clone, Copy, PartialEq, Eq)]
149        enum NumberKind {
150            Integer,
151            Float,
152            OctalOrFloat,
153            IntegerOrFloat,
154        }
155
156        let mut length = 1;
157        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
158            start_of_binary_number!() => {
159                length += 1;
160                (2, NumberKind::Integer)
161            }
162            start_of_octal_number!() => {
163                length += 1;
164                (8, NumberKind::Integer)
165            }
166            start_of_hexadecimal_number!() => {
167                length += 1;
168                (16, NumberKind::Integer)
169            }
170            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
171            start_of_float_number!() => (10, NumberKind::Float),
172            _ => (10, NumberKind::IntegerOrFloat),
173        };
174
175        if kind != NumberKind::Float {
176            length = read_digits_of_base(&self.input, length, base);
177            if kind == NumberKind::Integer {
178                return (TypeTokenKind::LiteralInteger, length);
179            }
180        }
181
182        let is_float = matches!(self.input.peek(length, 3), float_separator!());
183        if !is_float {
184            return (TypeTokenKind::LiteralInteger, length);
185        }
186
187        if let [b'.'] = self.input.peek(length, 1) {
188            length += 1;
189            length = read_digits_of_base(&self.input, length, 10);
190        }
191
192        if let float_exponent!() = self.input.peek(length, 1) {
193            length += 1;
194            if let number_sign!() = self.input.peek(length, 1) {
195                length += 1;
196            }
197            length = read_digits_of_base(&self.input, length, 10);
198        }
199
200        (TypeTokenKind::LiteralFloat, length)
201    }
202
203    #[inline]
204    fn read_literal_string(&self, quote: u8) -> (TypeTokenKind, usize) {
205        let total = self.input.len();
206        let start = self.input.current_offset();
207        let mut length = 1;
208        let mut last_was_backslash = false;
209        let mut partial = false;
210
211        loop {
212            let pos = start + length;
213            if pos >= total {
214                partial = true;
215                break;
216            }
217
218            let byte = self.input.read_at(pos);
219            if *byte == b'\\' {
220                last_was_backslash = !last_was_backslash;
221                length += 1;
222            } else {
223                if byte == &quote && !last_was_backslash {
224                    length += 1;
225                    break;
226                }
227                length += 1;
228                last_was_backslash = false;
229            }
230        }
231
232        if partial { (TypeTokenKind::PartialLiteralString, length) } else { (TypeTokenKind::LiteralString, length) }
233    }
234
235    #[inline]
236    fn read_fully_qualified_identifier(&self) -> (TypeTokenKind, usize) {
237        let mut length = 2;
238        let mut last_was_slash = false;
239        loop {
240            match self.input.peek(length, 1) {
241                [start_of_identifier!(), ..] if last_was_slash => {
242                    length += 1;
243                    last_was_slash = false;
244                }
245                [part_of_identifier!(), ..] if !last_was_slash => {
246                    length += 1;
247                }
248                [b'\\', ..] => {
249                    if last_was_slash {
250                        length -= 1;
251                        break;
252                    }
253                    length += 1;
254                    last_was_slash = true;
255                }
256                _ => break,
257            }
258        }
259        (TypeTokenKind::FullyQualifiedIdentifier, length)
260    }
261
262    /// Read an identifier or keyword (including compound keywords with hyphens).
263    /// This is the hot path - optimized for common case (simple identifiers).
264    #[inline]
265    fn read_identifier_or_keyword(&self) -> (TypeTokenKind, usize) {
266        let mut length = 1;
267        let mut next_is_hyphen = false;
268        let mut next_is_backslash = false;
269
270        loop {
271            match self.input.peek(length, 2) {
272                [part_of_identifier!(), ..] => length += 1,
273                [b'-', start_of_identifier!() | part_of_identifier!(), ..] => {
274                    next_is_hyphen = true;
275                    break;
276                }
277                [b'\\', start_of_identifier!(), ..] => {
278                    next_is_backslash = true;
279                    break;
280                }
281                _ => break,
282            }
283        }
284
285        if next_is_backslash {
286            return self.finish_qualified_identifier(length);
287        }
288
289        if !next_is_hyphen {
290            let bytes = self.input.read(length);
291            if let Some(kind) = keyword::lookup_keyword(bytes) {
292                return (kind, length);
293            }
294            return (TypeTokenKind::Identifier, length);
295        }
296
297        let base_len = length;
298        loop {
299            match self.input.peek(length, 2) {
300                [part_of_identifier!(), ..] => length += 1,
301                [b'-', start_of_identifier!() | part_of_identifier!(), ..] => length += 1,
302                _ => break,
303            }
304        }
305
306        let bytes = self.input.read(length);
307        if let Some(kind) = keyword::lookup_keyword(bytes) {
308            return (kind, length);
309        }
310
311        let base_bytes = self.input.read(base_len);
312        if let Some(kind) = keyword::lookup_keyword(base_bytes) {
313            return (kind, base_len);
314        }
315
316        (TypeTokenKind::Identifier, base_len)
317    }
318
319    /// Continue reading a qualified identifier (with backslashes).
320    #[inline]
321    fn finish_qualified_identifier(&self, start_len: usize) -> (TypeTokenKind, usize) {
322        let mut length = start_len;
323        let mut slashes = 0;
324        let mut last_was_slash = false;
325
326        loop {
327            match self.input.peek(length, 1) {
328                [start_of_identifier!(), ..] if last_was_slash => {
329                    length += 1;
330                    last_was_slash = false;
331                }
332                [part_of_identifier!(), ..] if !last_was_slash => {
333                    length += 1;
334                }
335                [b'\\', ..] => {
336                    if last_was_slash {
337                        length -= 1;
338                        slashes -= 1;
339                        break;
340                    }
341                    length += 1;
342                    slashes += 1;
343                    last_was_slash = true;
344                }
345                _ => break,
346            }
347        }
348
349        if last_was_slash {
350            length -= 1;
351            slashes -= 1;
352        }
353
354        if slashes > 0 { (TypeTokenKind::QualifiedIdentifier, length) } else { (TypeTokenKind::Identifier, length) }
355    }
356
357    #[inline]
358    fn token(&self, kind: TypeTokenKind, value: &'input [u8], start: Position, _end: Position) -> TypeToken<'input> {
359        let value_str = value.utf8_chunks().next().map_or("", |chunk| chunk.valid());
360        debug_assert_eq!(value_str.len(), value.len());
361        TypeToken { kind, start, value: value_str }
362    }
363}
364
365impl HasFileId for TypeLexer<'_> {
366    #[inline]
367    fn file_id(&self) -> FileId {
368        self.input.file_id()
369    }
370}