mago_type_syntax/
lexer.rs

1use mago_database::file::FileId;
2use mago_database::file::HasFileId;
3use mago_span::Position;
4use mago_span::Span;
5use mago_syntax_core::float_exponent;
6use mago_syntax_core::float_separator;
7use mago_syntax_core::input::Input;
8use mago_syntax_core::number_sign;
9use mago_syntax_core::part_of_identifier;
10use mago_syntax_core::start_of_binary_number;
11use mago_syntax_core::start_of_float_number;
12use mago_syntax_core::start_of_hexadecimal_number;
13use mago_syntax_core::start_of_identifier;
14use mago_syntax_core::start_of_number;
15use mago_syntax_core::start_of_octal_number;
16use mago_syntax_core::start_of_octal_or_float_number;
17use mago_syntax_core::utils::read_digits_of_base;
18
19use crate::error::SyntaxError;
20use crate::token::TypeToken;
21use crate::token::TypeTokenKind;
22
/// A lexer over type-syntax input that produces [`TypeToken`]s one at a time
/// through [`TypeLexer::advance`].
#[derive(Debug)]
pub struct TypeLexer<'input> {
    /// The byte input being tokenized; all reads, peeks and consumption go through it.
    input: Input<'input>,
}
27
28impl<'input> TypeLexer<'input> {
29    #[must_use]
30    pub fn new(input: Input<'input>) -> TypeLexer<'input> {
31        TypeLexer { input }
32    }
33
    /// Reports whether the underlying input has reached its end, as answered by
    /// `Input::has_reached_eof`.
    #[must_use]
    pub fn has_reached_eof(&self) -> bool {
        self.input.has_reached_eof()
    }
38
    /// Returns the current position of the underlying input, as reported by
    /// `Input::current_position`.
    #[must_use]
    pub fn current_position(&self) -> Position {
        self.input.current_position()
    }
43
44    /// Returns a string slice within a specified absolute range.
45    ///
46    /// This method exposes the underlying `Input::slice_in_range` functionality but
47    /// returns a `&str` instead of a `&[u8]`. It assumes the source is valid UTF-8.
48    ///
49    /// # Arguments
50    ///
51    /// * `from` - The absolute starting byte offset.
52    /// * `to` - The absolute ending byte offset (exclusive).
53    #[inline]
54    #[must_use]
55    pub fn slice_in_range(&self, from: u32, to: u32) -> &'input str {
56        let bytes_slice = self.input.slice_in_range(from, to);
57
58        // Reuse the same safe UTF-8 conversion logic as the `token` method.
59        bytes_slice.utf8_chunks().next().map_or("", |chunk| chunk.valid())
60    }
61
62    #[inline]
63    pub fn advance(&mut self) -> Option<Result<TypeToken<'input>, SyntaxError>> {
64        if self.input.has_reached_eof() {
65            return None;
66        }
67
68        let start = self.input.current_position();
69        let whitespaces = self.input.consume_whitespaces();
70        if !whitespaces.is_empty() {
71            let end = self.input.current_position();
72
73            return Some(Ok(self.token(TypeTokenKind::Whitespace, whitespaces, start, end)));
74        }
75
76        let (kind, length) = match self.input.read(3) {
77            [b'*', ..] => (TypeTokenKind::Asterisk, 1),
78            [b'n' | b'N', b'o' | b'O', b'n' | b'N'] => {
79                if self.input.is_at(b"non-positive-int", true) {
80                    (TypeTokenKind::NonPositiveInt, 16)
81                } else if self.input.is_at(b"non-negative-int", true) {
82                    (TypeTokenKind::NonNegativeInt, 16)
83                } else if self.input.is_at(b"non-empty-literal-string", true) {
84                    (TypeTokenKind::NonEmptyUnspecifiedLiteralString, 26)
85                } else if self.input.is_at(b"non-empty-string", true) {
86                    (TypeTokenKind::NonEmptyString, 16)
87                } else if self.input.is_at(b"non-empty-array", true) {
88                    (TypeTokenKind::NonEmptyArray, 15)
89                } else if self.input.is_at(b"non-empty-list", true) {
90                    (TypeTokenKind::NonEmptyList, 14)
91                } else if self.input.is_at(b"non-falsy-string", true) {
92                    (TypeTokenKind::NonFalsyString, 16)
93                } else if self.input.is_at(b"non-empty-lowercase-string", true) {
94                    (TypeTokenKind::NonEmptyLowercaseString, 26)
95                } else {
96                    self.read_identifier()
97                }
98            }
99            [b'p' | b'P', b'u' | b'U', b'r' | b'R'] => {
100                if self.input.is_at(b"pure-closure", true) {
101                    (TypeTokenKind::PureClosure, 12)
102                } else if self.input.is_at(b"pure-callable", true) {
103                    (TypeTokenKind::PureCallable, 13)
104                } else {
105                    self.read_identifier()
106                }
107            }
108            [b'n' | b'N', b'e' | b'E', b'v' | b'V'] => {
109                if self.input.is_at(b"never-return", true) {
110                    (TypeTokenKind::NeverReturn, 12)
111                } else if self.input.is_at(b"never-returns", true) {
112                    (TypeTokenKind::NeverReturns, 13)
113                } else {
114                    self.read_identifier()
115                }
116            }
117            [b't' | b'T', b'r' | b'R', b'u' | b'U'] => {
118                if self.input.is_at(b"truthy-string", true) {
119                    (TypeTokenKind::TruthyString, 13)
120                } else {
121                    self.read_identifier()
122                }
123            }
124            [b't' | b'T', b'r' | b'R', b'a' | b'A'] => {
125                if self.input.is_at(b"trait-string", true) {
126                    (TypeTokenKind::TraitString, 12)
127                } else {
128                    self.read_identifier()
129                }
130            }
131            [b'a' | b'A', b's' | b'S', b's' | b'S'] => {
132                if self.input.is_at(b"associative-array", true) {
133                    (TypeTokenKind::AssociativeArray, 17)
134                } else {
135                    self.read_identifier()
136                }
137            }
138            [b'c' | b'C', b'l' | b'L', b'a' | b'A'] => {
139                if self.input.is_at(b"class-string", true) {
140                    (TypeTokenKind::ClassString, 12)
141                } else {
142                    self.read_identifier()
143                }
144            }
145            [b'e' | b'E', b'n' | b'N', b'u' | b'U'] => {
146                if self.input.is_at(b"enum-string", true) {
147                    (TypeTokenKind::EnumString, 11)
148                } else {
149                    self.read_identifier()
150                }
151            }
152            [b'i' | b'I', b'n' | b'N', b't' | b'T'] => {
153                if self.input.is_at(b"interface-string", true) {
154                    (TypeTokenKind::InterfaceString, 16)
155                } else if self.input.is_at(b"int-mask-of", true) {
156                    (TypeTokenKind::IntMaskOf, 11)
157                } else if self.input.is_at(b"int-mask", true) {
158                    (TypeTokenKind::IntMask, 8)
159                } else {
160                    self.read_identifier()
161                }
162            }
163            [b'c' | b'C', b'l' | b'L', b'o' | b'O'] => {
164                if self.input.is_at(b"closed-resource", true) {
165                    (TypeTokenKind::ClosedResource, 15)
166                } else {
167                    self.read_identifier()
168                }
169            }
170            [b's' | b'S', b't' | b'T', b'r' | b'R'] => {
171                if self.input.is_at(b"stringable-object", true) {
172                    (TypeTokenKind::StringableObject, 17)
173                } else {
174                    self.read_identifier()
175                }
176            }
177            [b'n' | b'N', b'u' | b'U', b'm' | b'M'] => {
178                if self.input.is_at(b"numeric-string", true) {
179                    (TypeTokenKind::NumericString, 14)
180                } else {
181                    self.read_identifier()
182                }
183            }
184            [b'l' | b'L', b'i' | b'I', b't' | b'T'] => {
185                if self.input.is_at(b"literal-string", true) {
186                    (TypeTokenKind::UnspecifiedLiteralString, 14)
187                } else if self.input.is_at(b"literal-int", true) {
188                    (TypeTokenKind::UnspecifiedLiteralInt, 11)
189                } else if self.input.is_at(b"literal-float", true) {
190                    (TypeTokenKind::UnspecifiedLiteralFloat, 13)
191                } else {
192                    self.read_identifier()
193                }
194            }
195            [b'l' | b'L', b'o' | b'O', b'w' | b'W'] => {
196                if self.input.is_at(b"lowercase-string", true) {
197                    (TypeTokenKind::LowercaseString, 16)
198                } else {
199                    self.read_identifier()
200                }
201            }
202            [b'o' | b'O', b'p' | b'P', b'e' | b'E'] => {
203                if self.input.is_at(b"open-resource", true) {
204                    (TypeTokenKind::OpenResource, 13)
205                } else {
206                    self.read_identifier()
207                }
208            }
209            [b'a' | b'A', b'r' | b'R', b'r' | b'R'] => {
210                if self.input.is_at(b"array-key", true) {
211                    (TypeTokenKind::ArrayKey, 9)
212                } else {
213                    self.read_identifier()
214                }
215            }
216            [b'n' | b'N', b'o' | b'O', b'-'] => {
217                if self.input.is_at(b"no-return", true) {
218                    (TypeTokenKind::NoReturn, 9)
219                } else {
220                    self.read_identifier()
221                }
222            }
223            [b'v' | b'V', b'a' | b'A', b'l' | b'L'] => {
224                if self.input.is_at(b"value-of", true) {
225                    (TypeTokenKind::ValueOf, 8)
226                } else {
227                    self.read_identifier()
228                }
229            }
230            [b'k' | b'K', b'e' | b'E', b'y' | b'Y'] => {
231                if self.input.is_at(b"key-of", true) {
232                    (TypeTokenKind::KeyOf, 6)
233                } else {
234                    self.read_identifier()
235                }
236            }
237            [b'p' | b'P', b'r' | b'R', b'o' | b'O'] => {
238                if self.input.is_at(b"protected-properties-of", true) {
239                    (TypeTokenKind::ProtectedPropertiesOf, 23)
240                } else if self.input.is_at(b"properties-of", true) {
241                    (TypeTokenKind::PropertiesOf, 13)
242                } else {
243                    self.read_identifier()
244                }
245            }
246            [b'p' | b'P', b'u' | b'U', b'b' | b'B'] => {
247                if self.input.is_at(b"public-properties-of", true) {
248                    (TypeTokenKind::PublicPropertiesOf, 20)
249                } else {
250                    self.read_identifier()
251                }
252            }
253            [b'p' | b'P', b'r' | b'R', b'i' | b'I'] => {
254                if self.input.is_at(b"private-properties-of", true) {
255                    (TypeTokenKind::PrivatePropertiesOf, 21)
256                } else {
257                    self.read_identifier()
258                }
259            }
260            [b'p' | b'P', b'o' | b'O', b's' | b'S'] => {
261                if self.input.is_at(b"positive-int", true) {
262                    (TypeTokenKind::PositiveInt, 12)
263                } else {
264                    self.read_identifier()
265                }
266            }
267            [b'n' | b'N', b'e' | b'E', b'g' | b'G'] => {
268                if self.input.is_at(b"negative-int", true) {
269                    (TypeTokenKind::NegativeInt, 12)
270                } else {
271                    self.read_identifier()
272                }
273            }
274            [b'.', b'.', b'.'] => (TypeTokenKind::Ellipsis, 3),
275            [b':', b':', ..] => (TypeTokenKind::ColonColon, 2),
276            [b'/', b'/', ..] => self.read_single_line_comment(),
277            [b'.', start_of_number!(), ..] => self.read_decimal(),
278            [start_of_number!(), ..] => self.read_number(),
279            [quote @ (b'\'' | b'"'), ..] => self.read_literal_string(*quote),
280            [b'\\', start_of_identifier!(), ..] => self.read_fully_qualified_identifier(),
281            [start_of_identifier!(), ..] => self.read_identifier(),
282            [b'$', start_of_identifier!(), ..] => {
283                let mut length = 2;
284                while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
285                    length += 1;
286                }
287
288                (TypeTokenKind::Variable, length)
289            }
290            [b':', ..] => (TypeTokenKind::Colon, 1),
291            [b'=', ..] => (TypeTokenKind::Equals, 1),
292            [b'?', ..] => (TypeTokenKind::Question, 1),
293            [b'!', ..] => (TypeTokenKind::Exclamation, 1),
294            [b'&', ..] => (TypeTokenKind::Ampersand, 1),
295            [b'|', ..] => (TypeTokenKind::Pipe, 1),
296            [b'>', ..] => (TypeTokenKind::GreaterThan, 1),
297            [b'<', ..] => (TypeTokenKind::LessThan, 1),
298            [b'(', ..] => (TypeTokenKind::LeftParenthesis, 1),
299            [b')', ..] => (TypeTokenKind::RightParenthesis, 1),
300            [b'[', ..] => (TypeTokenKind::LeftBracket, 1),
301            [b']', ..] => (TypeTokenKind::RightBracket, 1),
302            [b'{', ..] => (TypeTokenKind::LeftBrace, 1),
303            [b'}', ..] => (TypeTokenKind::RightBrace, 1),
304            [b',', ..] => (TypeTokenKind::Comma, 1),
305            [b'+', ..] => (TypeTokenKind::Plus, 1),
306            [b'-', ..] => (TypeTokenKind::Minus, 1),
307            [unknown_byte, ..] => {
308                return Some(Err(SyntaxError::UnrecognizedToken(
309                    self.file_id(),
310                    *unknown_byte,
311                    self.input.current_position(),
312                )));
313            }
314            [] => {
315                unreachable!()
316            }
317        };
318
319        let buffer = self.input.consume(length);
320        let end = self.input.current_position();
321
322        Some(Ok(self.token(kind, buffer, start, end)))
323    }
324
325    fn read_single_line_comment(&self) -> (TypeTokenKind, usize) {
326        let mut length = 2;
327        loop {
328            match self.input.peek(length, 1) {
329                [b'\n', ..] | [] => {
330                    break;
331                }
332                [_, ..] => {
333                    length += 1;
334                }
335            }
336        }
337
338        (TypeTokenKind::SingleLineComment, length)
339    }
340
341    fn read_decimal(&self) -> (TypeTokenKind, usize) {
342        let mut length = read_digits_of_base(&self.input, 2, 10);
343        if let float_exponent!() = self.input.peek(length, 1) {
344            length += 1;
345            if let number_sign!() = self.input.peek(length, 1) {
346                length += 1;
347            }
348
349            length = read_digits_of_base(&self.input, length, 10);
350        }
351
352        (TypeTokenKind::LiteralFloat, length)
353    }
354
355    fn read_number(&self) -> (TypeTokenKind, usize) {
356        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
357        pub enum NumberKind {
358            Integer,
359            Float,
360            OctalOrFloat,
361            IntegerOrFloat,
362        }
363
364        let mut length = 1;
365
366        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
367            start_of_binary_number!() => {
368                length += 1;
369
370                (2, NumberKind::Integer)
371            }
372            start_of_octal_number!() => {
373                length += 1;
374
375                (8, NumberKind::Integer)
376            }
377            start_of_hexadecimal_number!() => {
378                length += 1;
379
380                (16, NumberKind::Integer)
381            }
382            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
383            start_of_float_number!() => (10, NumberKind::Float),
384            _ => (10, NumberKind::IntegerOrFloat),
385        };
386
387        if kind != NumberKind::Float {
388            length = read_digits_of_base(&self.input, length, base);
389
390            if kind == NumberKind::Integer {
391                return (TypeTokenKind::LiteralInteger, length);
392            }
393        }
394
395        let is_float = matches!(self.input.peek(length, 3), float_separator!());
396
397        if !is_float {
398            return (TypeTokenKind::LiteralInteger, length);
399        }
400
401        if let [b'.'] = self.input.peek(length, 1) {
402            length += 1;
403            length = read_digits_of_base(&self.input, length, 10);
404        }
405
406        if let float_exponent!() = self.input.peek(length, 1) {
407            length += 1;
408            if let number_sign!() = self.input.peek(length, 1) {
409                length += 1;
410            }
411
412            length = read_digits_of_base(&self.input, length, 10);
413        }
414
415        (TypeTokenKind::LiteralFloat, length)
416    }
417
418    fn read_literal_string(&self, quote: u8) -> (TypeTokenKind, usize) {
419        let total = self.input.len();
420        let start = self.input.current_offset();
421        let mut length = 1; // We assume the opening quote is already consumed.
422        let mut last_was_backslash = false;
423        let mut partial = false;
424
425        loop {
426            let pos = start + length;
427            if pos >= total {
428                // Reached EOF before closing quote.
429                partial = true;
430                break;
431            }
432
433            let byte = self.input.read_at(pos);
434            if matches!(byte, b'\\') {
435                // Toggle the backslash flag.
436                last_was_backslash = !last_was_backslash;
437                length += 1;
438            } else {
439                // If we see the closing quote and the previous byte was not an escape.
440                if byte == &quote && !last_was_backslash {
441                    length += 1; // Include the closing quote.
442                    break;
443                }
444
445                length += 1;
446                last_was_backslash = false;
447            }
448        }
449
450        if partial { (TypeTokenKind::PartialLiteralString, length) } else { (TypeTokenKind::LiteralString, length) }
451    }
452
453    fn read_fully_qualified_identifier(&self) -> (TypeTokenKind, usize) {
454        let mut length = 2;
455        let mut last_was_slash = false;
456        loop {
457            match self.input.peek(length, 1) {
458                [start_of_identifier!(), ..] if last_was_slash => {
459                    length += 1;
460                    last_was_slash = false;
461                }
462                [part_of_identifier!(), ..] if !last_was_slash => {
463                    length += 1;
464                }
465                [b'\\', ..] => {
466                    if last_was_slash {
467                        length -= 1;
468
469                        break;
470                    }
471
472                    length += 1;
473                    last_was_slash = true;
474                }
475                _ => {
476                    break;
477                }
478            }
479        }
480
481        (TypeTokenKind::FullyQualifiedIdentifier, length)
482    }
483
484    fn read_identifier(&self) -> (TypeTokenKind, usize) {
485        const KEYWORD_TYPES: [(&[u8], TypeTokenKind); 28] = [
486            (b"list", TypeTokenKind::List),
487            (b"int", TypeTokenKind::Int),
488            (b"integer", TypeTokenKind::Integer),
489            (b"string", TypeTokenKind::String),
490            (b"float", TypeTokenKind::Float),
491            (b"double", TypeTokenKind::Double),
492            (b"real", TypeTokenKind::Real),
493            (b"bool", TypeTokenKind::Bool),
494            (b"boolean", TypeTokenKind::Boolean),
495            (b"false", TypeTokenKind::False),
496            (b"true", TypeTokenKind::True),
497            (b"object", TypeTokenKind::Object),
498            (b"callable", TypeTokenKind::Callable),
499            (b"array", TypeTokenKind::Array),
500            (b"iterable", TypeTokenKind::Iterable),
501            (b"null", TypeTokenKind::Null),
502            (b"mixed", TypeTokenKind::Mixed),
503            (b"resource", TypeTokenKind::Resource),
504            (b"void", TypeTokenKind::Void),
505            (b"scalar", TypeTokenKind::Scalar),
506            (b"numeric", TypeTokenKind::Numeric),
507            (b"never", TypeTokenKind::Never),
508            (b"nothing", TypeTokenKind::Nothing),
509            (b"as", TypeTokenKind::As),
510            (b"is", TypeTokenKind::Is),
511            (b"not", TypeTokenKind::Not),
512            (b"min", TypeTokenKind::Min),
513            (b"max", TypeTokenKind::Max),
514        ];
515
516        let mut length = 1;
517        let mut ended_with_slash = false;
518        loop {
519            match self.input.peek(length, 2) {
520                [part_of_identifier!(), ..] => {
521                    length += 1;
522                }
523                [b'\\', start_of_identifier!(), ..] => {
524                    ended_with_slash = true;
525                    break;
526                }
527                _ => {
528                    break;
529                }
530            }
531        }
532
533        if !ended_with_slash {
534            for (value, kind) in KEYWORD_TYPES {
535                let keyword_length = value.len();
536                if keyword_length != length {
537                    continue;
538                }
539
540                if self.input.is_at(value, true) {
541                    return (kind, keyword_length);
542                }
543            }
544        }
545
546        let mut slashes = 0;
547        let mut last_was_slash = false;
548        loop {
549            match self.input.peek(length, 1) {
550                [start_of_identifier!(), ..] if last_was_slash => {
551                    length += 1;
552                    last_was_slash = false;
553                }
554                [part_of_identifier!(), ..] if !last_was_slash => {
555                    length += 1;
556                }
557                [b'\\', ..] => {
558                    if last_was_slash {
559                        length -= 1;
560                        slashes -= 1;
561                        last_was_slash = false;
562
563                        break;
564                    }
565                    length += 1;
566                    slashes += 1;
567                    last_was_slash = true;
568                }
569                _ => {
570                    break;
571                }
572            }
573        }
574
575        if last_was_slash {
576            length -= 1;
577            slashes -= 1;
578        }
579
580        if slashes > 0 { (TypeTokenKind::QualifiedIdentifier, length) } else { (TypeTokenKind::Identifier, length) }
581    }
582
583    #[inline]
584    fn token(&self, kind: TypeTokenKind, value: &'input [u8], from: Position, to: Position) -> TypeToken<'input> {
585        let mut value_chunks = value.utf8_chunks();
586        let value_str = if let Some(chunk) = value_chunks.next() {
587            let valid = chunk.valid();
588
589            debug_assert_eq!(valid.len(), value.len());
590
591            valid
592        } else {
593            ""
594        };
595
596        TypeToken { kind, value: value_str, span: Span::new(self.file_id(), from, to) }
597    }
598}
599
/// The lexer reports the file id of the input it is tokenizing.
impl HasFileId for TypeLexer<'_> {
    /// Returns the file id of the underlying input.
    fn file_id(&self) -> FileId {
        self.input.file_id()
    }
}