Skip to main content

mago_type_syntax/
lexer.rs

1use mago_database::file::FileId;
2use mago_database::file::HasFileId;
3use mago_span::Position;
4use mago_span::Span;
5use mago_syntax_core::float_exponent;
6use mago_syntax_core::float_separator;
7use mago_syntax_core::input::Input;
8use mago_syntax_core::number_sign;
9use mago_syntax_core::part_of_identifier;
10use mago_syntax_core::start_of_binary_number;
11use mago_syntax_core::start_of_float_number;
12use mago_syntax_core::start_of_hexadecimal_number;
13use mago_syntax_core::start_of_identifier;
14use mago_syntax_core::start_of_number;
15use mago_syntax_core::start_of_octal_number;
16use mago_syntax_core::start_of_octal_or_float_number;
17use mago_syntax_core::utils::read_digits_of_base;
18
19use crate::error::SyntaxError;
20use crate::token::TypeToken;
21use crate::token::TypeTokenKind;
22
/// A lexer that turns type-expression source bytes into [`TypeToken`]s.
///
/// The lexer owns an [`Input`] cursor; tokens are pulled one at a time with
/// `advance`, which consumes the bytes belonging to each token it returns.
#[derive(Debug)]
pub struct TypeLexer<'input> {
    /// The byte input the lexer reads from and consumes.
    input: Input<'input>,
}
27
28impl<'input> TypeLexer<'input> {
29    #[must_use]
30    pub fn new(input: Input<'input>) -> TypeLexer<'input> {
31        TypeLexer { input }
32    }
33
    /// Returns `true` when the underlying input reports that it has reached EOF.
    #[must_use]
    pub fn has_reached_eof(&self) -> bool {
        self.input.has_reached_eof()
    }
38
    /// Returns the current position of the underlying input cursor.
    #[must_use]
    pub fn current_position(&self) -> Position {
        self.input.current_position()
    }
43
44    /// Returns a string slice within a specified absolute range.
45    ///
46    /// This method exposes the underlying `Input::slice_in_range` functionality but
47    /// returns a `&str` instead of a `&[u8]`. It assumes the source is valid UTF-8.
48    ///
49    /// # Arguments
50    ///
51    /// * `from` - The absolute starting byte offset.
52    /// * `to` - The absolute ending byte offset (exclusive).
53    #[inline]
54    #[must_use]
55    pub fn slice_in_range(&self, from: u32, to: u32) -> &'input str {
56        let bytes_slice = self.input.slice_in_range(from, to);
57
58        // Reuse the same safe UTF-8 conversion logic as the `token` method.
59        bytes_slice.utf8_chunks().next().map_or("", |chunk| chunk.valid())
60    }
61
62    #[inline]
63    pub fn advance(&mut self) -> Option<Result<TypeToken<'input>, SyntaxError>> {
64        if self.input.has_reached_eof() {
65            return None;
66        }
67
68        let start = self.input.current_position();
69        let whitespaces = self.input.consume_whitespaces();
70        if !whitespaces.is_empty() {
71            let end = self.input.current_position();
72
73            return Some(Ok(self.token(TypeTokenKind::Whitespace, whitespaces, start, end)));
74        }
75
76        let (kind, length) = match self.input.read(3) {
77            [b'*', ..] => (TypeTokenKind::Asterisk, 1),
78            [b'n' | b'N', b'o' | b'O', b'n' | b'N'] => {
79                if self.input.is_at(b"non-positive-int", true) {
80                    (TypeTokenKind::NonPositiveInt, 16)
81                } else if self.input.is_at(b"non-negative-int", true) {
82                    (TypeTokenKind::NonNegativeInt, 16)
83                } else if self.input.is_at(b"non-empty-literal-string", true) {
84                    (TypeTokenKind::NonEmptyUnspecifiedLiteralString, 26)
85                } else if self.input.is_at(b"non-empty-string", true) {
86                    (TypeTokenKind::NonEmptyString, 16)
87                } else if self.input.is_at(b"non-empty-array", true) {
88                    (TypeTokenKind::NonEmptyArray, 15)
89                } else if self.input.is_at(b"non-empty-list", true) {
90                    (TypeTokenKind::NonEmptyList, 14)
91                } else if self.input.is_at(b"non-falsy-string", true) {
92                    (TypeTokenKind::NonFalsyString, 16)
93                } else if self.input.is_at(b"non-empty-lowercase-string", true) {
94                    (TypeTokenKind::NonEmptyLowercaseString, 26)
95                } else if self.input.is_at(b"non-empty-mixed", true) {
96                    (TypeTokenKind::NonEmptyMixed, 15)
97                } else {
98                    self.read_identifier()
99                }
100            }
101            [b'p' | b'P', b'u' | b'U', b'r' | b'R'] => {
102                if self.input.is_at(b"pure-closure", true) {
103                    (TypeTokenKind::PureClosure, 12)
104                } else if self.input.is_at(b"pure-callable", true) {
105                    (TypeTokenKind::PureCallable, 13)
106                } else {
107                    self.read_identifier()
108                }
109            }
110            [b'n' | b'N', b'e' | b'E', b'v' | b'V'] => {
111                if self.input.is_at(b"never-return", true) {
112                    (TypeTokenKind::NeverReturn, 12)
113                } else if self.input.is_at(b"never-returns", true) {
114                    (TypeTokenKind::NeverReturns, 13)
115                } else {
116                    self.read_identifier()
117                }
118            }
119            [b't' | b'T', b'r' | b'R', b'u' | b'U'] => {
120                if self.input.is_at(b"truthy-string", true) {
121                    (TypeTokenKind::TruthyString, 13)
122                } else {
123                    self.read_identifier()
124                }
125            }
126            [b't' | b'T', b'r' | b'R', b'a' | b'A'] => {
127                if self.input.is_at(b"trait-string", true) {
128                    (TypeTokenKind::TraitString, 12)
129                } else {
130                    self.read_identifier()
131                }
132            }
133            [b'a' | b'A', b's' | b'S', b's' | b'S'] => {
134                if self.input.is_at(b"associative-array", true) {
135                    (TypeTokenKind::AssociativeArray, 17)
136                } else {
137                    self.read_identifier()
138                }
139            }
140            [b'c' | b'C', b'l' | b'L', b'a' | b'A'] => {
141                if self.input.is_at(b"class-string", true) {
142                    (TypeTokenKind::ClassString, 12)
143                } else {
144                    self.read_identifier()
145                }
146            }
147            [b'e' | b'E', b'n' | b'N', b'u' | b'U'] => {
148                if self.input.is_at(b"enum-string", true) {
149                    (TypeTokenKind::EnumString, 11)
150                } else {
151                    self.read_identifier()
152                }
153            }
154            [b'i' | b'I', b'n' | b'N', b't' | b'T'] => {
155                if self.input.is_at(b"interface-string", true) {
156                    (TypeTokenKind::InterfaceString, 16)
157                } else if self.input.is_at(b"int-mask-of", true) {
158                    (TypeTokenKind::IntMaskOf, 11)
159                } else if self.input.is_at(b"int-mask", true) {
160                    (TypeTokenKind::IntMask, 8)
161                } else {
162                    self.read_identifier()
163                }
164            }
165            [b'c' | b'C', b'l' | b'L', b'o' | b'O'] => {
166                if self.input.is_at(b"closed-resource", true) {
167                    (TypeTokenKind::ClosedResource, 15)
168                } else {
169                    self.read_identifier()
170                }
171            }
172            [b's' | b'S', b't' | b'T', b'r' | b'R'] => {
173                if self.input.is_at(b"stringable-object", true) {
174                    (TypeTokenKind::StringableObject, 17)
175                } else {
176                    self.read_identifier()
177                }
178            }
179            [b'n' | b'N', b'u' | b'U', b'm' | b'M'] => {
180                if self.input.is_at(b"numeric-string", true) {
181                    (TypeTokenKind::NumericString, 14)
182                } else {
183                    self.read_identifier()
184                }
185            }
186            [b'l' | b'L', b'i' | b'I', b't' | b'T'] => {
187                if self.input.is_at(b"literal-string", true) {
188                    (TypeTokenKind::UnspecifiedLiteralString, 14)
189                } else if self.input.is_at(b"literal-int", true) {
190                    (TypeTokenKind::UnspecifiedLiteralInt, 11)
191                } else if self.input.is_at(b"literal-float", true) {
192                    (TypeTokenKind::UnspecifiedLiteralFloat, 13)
193                } else {
194                    self.read_identifier()
195                }
196            }
197            [b'l' | b'L', b'o' | b'O', b'w' | b'W'] => {
198                if self.input.is_at(b"lowercase-string", true) {
199                    (TypeTokenKind::LowercaseString, 16)
200                } else {
201                    self.read_identifier()
202                }
203            }
204            [b'o' | b'O', b'p' | b'P', b'e' | b'E'] => {
205                if self.input.is_at(b"open-resource", true) {
206                    (TypeTokenKind::OpenResource, 13)
207                } else {
208                    self.read_identifier()
209                }
210            }
211            [b'a' | b'A', b'r' | b'R', b'r' | b'R'] => {
212                if self.input.is_at(b"array-key", true) {
213                    (TypeTokenKind::ArrayKey, 9)
214                } else {
215                    self.read_identifier()
216                }
217            }
218            [b'n' | b'N', b'o' | b'O', b'-'] => {
219                if self.input.is_at(b"no-return", true) {
220                    (TypeTokenKind::NoReturn, 9)
221                } else {
222                    self.read_identifier()
223                }
224            }
225            [b'v' | b'V', b'a' | b'A', b'l' | b'L'] => {
226                if self.input.is_at(b"value-of", true) {
227                    (TypeTokenKind::ValueOf, 8)
228                } else {
229                    self.read_identifier()
230                }
231            }
232            [b'k' | b'K', b'e' | b'E', b'y' | b'Y'] => {
233                if self.input.is_at(b"key-of", true) {
234                    (TypeTokenKind::KeyOf, 6)
235                } else {
236                    self.read_identifier()
237                }
238            }
239            [b'p' | b'P', b'r' | b'R', b'o' | b'O'] => {
240                if self.input.is_at(b"protected-properties-of", true) {
241                    (TypeTokenKind::ProtectedPropertiesOf, 23)
242                } else if self.input.is_at(b"properties-of", true) {
243                    (TypeTokenKind::PropertiesOf, 13)
244                } else {
245                    self.read_identifier()
246                }
247            }
248            [b'p' | b'P', b'u' | b'U', b'b' | b'B'] => {
249                if self.input.is_at(b"public-properties-of", true) {
250                    (TypeTokenKind::PublicPropertiesOf, 20)
251                } else {
252                    self.read_identifier()
253                }
254            }
255            [b'p' | b'P', b'r' | b'R', b'i' | b'I'] => {
256                if self.input.is_at(b"private-properties-of", true) {
257                    (TypeTokenKind::PrivatePropertiesOf, 21)
258                } else {
259                    self.read_identifier()
260                }
261            }
262            [b'p' | b'P', b'o' | b'O', b's' | b'S'] => {
263                if self.input.is_at(b"positive-int", true) {
264                    (TypeTokenKind::PositiveInt, 12)
265                } else {
266                    self.read_identifier()
267                }
268            }
269            [b'n' | b'N', b'e' | b'E', b'g' | b'G'] => {
270                if self.input.is_at(b"negative-int", true) {
271                    (TypeTokenKind::NegativeInt, 12)
272                } else {
273                    self.read_identifier()
274                }
275            }
276            [b'.', b'.', b'.'] => (TypeTokenKind::Ellipsis, 3),
277            [b':', b':', ..] => (TypeTokenKind::ColonColon, 2),
278            [b'/', b'/', ..] => self.read_single_line_comment(),
279            [b'.', start_of_number!(), ..] => self.read_decimal(),
280            [start_of_number!(), ..] => self.read_number(),
281            [quote @ (b'\'' | b'"'), ..] => self.read_literal_string(*quote),
282            [b'\\', start_of_identifier!(), ..] => self.read_fully_qualified_identifier(),
283            [start_of_identifier!(), ..] => self.read_identifier(),
284            [b'$', start_of_identifier!(), ..] => {
285                let mut length = 2;
286                while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
287                    length += 1;
288                }
289
290                (TypeTokenKind::Variable, length)
291            }
292            [b':', ..] => (TypeTokenKind::Colon, 1),
293            [b'=', ..] => (TypeTokenKind::Equals, 1),
294            [b'?', ..] => (TypeTokenKind::Question, 1),
295            [b'!', ..] => (TypeTokenKind::Exclamation, 1),
296            [b'&', ..] => (TypeTokenKind::Ampersand, 1),
297            [b'|', ..] => (TypeTokenKind::Pipe, 1),
298            [b'>', ..] => (TypeTokenKind::GreaterThan, 1),
299            [b'<', ..] => (TypeTokenKind::LessThan, 1),
300            [b'(', ..] => (TypeTokenKind::LeftParenthesis, 1),
301            [b')', ..] => (TypeTokenKind::RightParenthesis, 1),
302            [b'[', ..] => (TypeTokenKind::LeftBracket, 1),
303            [b']', ..] => (TypeTokenKind::RightBracket, 1),
304            [b'{', ..] => (TypeTokenKind::LeftBrace, 1),
305            [b'}', ..] => (TypeTokenKind::RightBrace, 1),
306            [b',', ..] => (TypeTokenKind::Comma, 1),
307            [b'+', ..] => (TypeTokenKind::Plus, 1),
308            [b'-', ..] => (TypeTokenKind::Minus, 1),
309            [unknown_byte, ..] => {
310                return Some(Err(SyntaxError::UnrecognizedToken(
311                    self.file_id(),
312                    *unknown_byte,
313                    self.input.current_position(),
314                )));
315            }
316            [] => {
317                unreachable!()
318            }
319        };
320
321        let buffer = self.input.consume(length);
322        let end = self.input.current_position();
323
324        Some(Ok(self.token(kind, buffer, start, end)))
325    }
326
327    fn read_single_line_comment(&self) -> (TypeTokenKind, usize) {
328        let mut length = 2;
329        loop {
330            match self.input.peek(length, 1) {
331                [b'\n', ..] | [] => {
332                    break;
333                }
334                [_, ..] => {
335                    length += 1;
336                }
337            }
338        }
339
340        (TypeTokenKind::SingleLineComment, length)
341    }
342
343    fn read_decimal(&self) -> (TypeTokenKind, usize) {
344        let mut length = read_digits_of_base(&self.input, 2, 10);
345        if let float_exponent!() = self.input.peek(length, 1) {
346            length += 1;
347            if let number_sign!() = self.input.peek(length, 1) {
348                length += 1;
349            }
350
351            length = read_digits_of_base(&self.input, length, 10);
352        }
353
354        (TypeTokenKind::LiteralFloat, length)
355    }
356
357    fn read_number(&self) -> (TypeTokenKind, usize) {
358        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
359        pub enum NumberKind {
360            Integer,
361            Float,
362            OctalOrFloat,
363            IntegerOrFloat,
364        }
365
366        let mut length = 1;
367
368        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
369            start_of_binary_number!() => {
370                length += 1;
371
372                (2, NumberKind::Integer)
373            }
374            start_of_octal_number!() => {
375                length += 1;
376
377                (8, NumberKind::Integer)
378            }
379            start_of_hexadecimal_number!() => {
380                length += 1;
381
382                (16, NumberKind::Integer)
383            }
384            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
385            start_of_float_number!() => (10, NumberKind::Float),
386            _ => (10, NumberKind::IntegerOrFloat),
387        };
388
389        if kind != NumberKind::Float {
390            length = read_digits_of_base(&self.input, length, base);
391
392            if kind == NumberKind::Integer {
393                return (TypeTokenKind::LiteralInteger, length);
394            }
395        }
396
397        let is_float = matches!(self.input.peek(length, 3), float_separator!());
398
399        if !is_float {
400            return (TypeTokenKind::LiteralInteger, length);
401        }
402
403        if let [b'.'] = self.input.peek(length, 1) {
404            length += 1;
405            length = read_digits_of_base(&self.input, length, 10);
406        }
407
408        if let float_exponent!() = self.input.peek(length, 1) {
409            length += 1;
410            if let number_sign!() = self.input.peek(length, 1) {
411                length += 1;
412            }
413
414            length = read_digits_of_base(&self.input, length, 10);
415        }
416
417        (TypeTokenKind::LiteralFloat, length)
418    }
419
420    fn read_literal_string(&self, quote: u8) -> (TypeTokenKind, usize) {
421        let total = self.input.len();
422        let start = self.input.current_offset();
423        let mut length = 1; // We assume the opening quote is already consumed.
424        let mut last_was_backslash = false;
425        let mut partial = false;
426
427        loop {
428            let pos = start + length;
429            if pos >= total {
430                // Reached EOF before closing quote.
431                partial = true;
432                break;
433            }
434
435            let byte = self.input.read_at(pos);
436            if matches!(byte, b'\\') {
437                // Toggle the backslash flag.
438                last_was_backslash = !last_was_backslash;
439                length += 1;
440            } else {
441                // If we see the closing quote and the previous byte was not an escape.
442                if byte == &quote && !last_was_backslash {
443                    length += 1; // Include the closing quote.
444                    break;
445                }
446
447                length += 1;
448                last_was_backslash = false;
449            }
450        }
451
452        if partial { (TypeTokenKind::PartialLiteralString, length) } else { (TypeTokenKind::LiteralString, length) }
453    }
454
455    fn read_fully_qualified_identifier(&self) -> (TypeTokenKind, usize) {
456        let mut length = 2;
457        let mut last_was_slash = false;
458        loop {
459            match self.input.peek(length, 1) {
460                [start_of_identifier!(), ..] if last_was_slash => {
461                    length += 1;
462                    last_was_slash = false;
463                }
464                [part_of_identifier!(), ..] if !last_was_slash => {
465                    length += 1;
466                }
467                [b'\\', ..] => {
468                    if last_was_slash {
469                        length -= 1;
470
471                        break;
472                    }
473
474                    length += 1;
475                    last_was_slash = true;
476                }
477                _ => {
478                    break;
479                }
480            }
481        }
482
483        (TypeTokenKind::FullyQualifiedIdentifier, length)
484    }
485
486    fn read_identifier(&self) -> (TypeTokenKind, usize) {
487        const KEYWORD_TYPES: [(&[u8], TypeTokenKind); 28] = [
488            (b"list", TypeTokenKind::List),
489            (b"int", TypeTokenKind::Int),
490            (b"integer", TypeTokenKind::Integer),
491            (b"string", TypeTokenKind::String),
492            (b"float", TypeTokenKind::Float),
493            (b"double", TypeTokenKind::Double),
494            (b"real", TypeTokenKind::Real),
495            (b"bool", TypeTokenKind::Bool),
496            (b"boolean", TypeTokenKind::Boolean),
497            (b"false", TypeTokenKind::False),
498            (b"true", TypeTokenKind::True),
499            (b"object", TypeTokenKind::Object),
500            (b"callable", TypeTokenKind::Callable),
501            (b"array", TypeTokenKind::Array),
502            (b"iterable", TypeTokenKind::Iterable),
503            (b"null", TypeTokenKind::Null),
504            (b"mixed", TypeTokenKind::Mixed),
505            (b"resource", TypeTokenKind::Resource),
506            (b"void", TypeTokenKind::Void),
507            (b"scalar", TypeTokenKind::Scalar),
508            (b"numeric", TypeTokenKind::Numeric),
509            (b"never", TypeTokenKind::Never),
510            (b"nothing", TypeTokenKind::Nothing),
511            (b"as", TypeTokenKind::As),
512            (b"is", TypeTokenKind::Is),
513            (b"not", TypeTokenKind::Not),
514            (b"min", TypeTokenKind::Min),
515            (b"max", TypeTokenKind::Max),
516        ];
517
518        let mut length = 1;
519        let mut ended_with_slash = false;
520        loop {
521            match self.input.peek(length, 2) {
522                [part_of_identifier!(), ..] => {
523                    length += 1;
524                }
525                [b'\\', start_of_identifier!(), ..] => {
526                    ended_with_slash = true;
527                    break;
528                }
529                _ => {
530                    break;
531                }
532            }
533        }
534
535        if !ended_with_slash {
536            for (value, kind) in KEYWORD_TYPES {
537                let keyword_length = value.len();
538                if keyword_length != length {
539                    continue;
540                }
541
542                if self.input.is_at(value, true) {
543                    return (kind, keyword_length);
544                }
545            }
546        }
547
548        let mut slashes = 0;
549        let mut last_was_slash = false;
550        loop {
551            match self.input.peek(length, 1) {
552                [start_of_identifier!(), ..] if last_was_slash => {
553                    length += 1;
554                    last_was_slash = false;
555                }
556                [part_of_identifier!(), ..] if !last_was_slash => {
557                    length += 1;
558                }
559                [b'\\', ..] => {
560                    if last_was_slash {
561                        length -= 1;
562                        slashes -= 1;
563                        last_was_slash = false;
564
565                        break;
566                    }
567                    length += 1;
568                    slashes += 1;
569                    last_was_slash = true;
570                }
571                _ => {
572                    break;
573                }
574            }
575        }
576
577        if last_was_slash {
578            length -= 1;
579            slashes -= 1;
580        }
581
582        if slashes > 0 { (TypeTokenKind::QualifiedIdentifier, length) } else { (TypeTokenKind::Identifier, length) }
583    }
584
585    #[inline]
586    fn token(&self, kind: TypeTokenKind, value: &'input [u8], from: Position, to: Position) -> TypeToken<'input> {
587        let mut value_chunks = value.utf8_chunks();
588        let value_str = if let Some(chunk) = value_chunks.next() {
589            let valid = chunk.valid();
590
591            debug_assert_eq!(valid.len(), value.len());
592
593            valid
594        } else {
595            ""
596        };
597
598        TypeToken { kind, value: value_str, span: Span::new(self.file_id(), from, to) }
599    }
600}
601
/// Exposes the identifier of the file whose input is being lexed.
impl HasFileId for TypeLexer<'_> {
    /// Returns the `FileId` reported by the underlying input.
    fn file_id(&self) -> FileId {
        self.input.file_id()
    }
}
606}