mago_type_syntax/
lexer.rs

1use mago_database::file::FileId;
2use mago_database::file::HasFileId;
3use mago_span::Position;
4use mago_span::Span;
5use mago_syntax_core::float_exponent;
6use mago_syntax_core::float_separator;
7use mago_syntax_core::input::Input;
8use mago_syntax_core::number_sign;
9use mago_syntax_core::part_of_identifier;
10use mago_syntax_core::start_of_binary_number;
11use mago_syntax_core::start_of_float_number;
12use mago_syntax_core::start_of_hexadecimal_number;
13use mago_syntax_core::start_of_identifier;
14use mago_syntax_core::start_of_number;
15use mago_syntax_core::start_of_octal_number;
16use mago_syntax_core::start_of_octal_or_float_number;
17use mago_syntax_core::utils::read_digits_of_base;
18
19use crate::error::SyntaxError;
20use crate::token::TypeToken;
21use crate::token::TypeTokenKind;
22
/// Lexer that turns the bytes of an [`Input`] into [`TypeToken`]s.
#[derive(Debug)]
pub struct TypeLexer<'input> {
    /// The input this lexer peeks into and consumes from.
    input: Input<'input>,
}
27
28impl<'input> TypeLexer<'input> {
29    pub fn new(input: Input<'input>) -> TypeLexer<'input> {
30        TypeLexer { input }
31    }
32
    /// Reports whether the underlying input has reached its end.
    ///
    /// Delegates to the wrapped `Input`.
    pub fn has_reached_eof(&self) -> bool {
        self.input.has_reached_eof()
    }
36
    /// Returns the current position of the underlying input.
    ///
    /// Delegates to the wrapped `Input`.
    pub fn current_position(&self) -> Position {
        self.input.current_position()
    }
40
41    /// Returns a string slice within a specified absolute range.
42    ///
43    /// This method exposes the underlying `Input::slice_in_range` functionality but
44    /// returns a `&str` instead of a `&[u8]`. It assumes the source is valid UTF-8.
45    ///
46    /// # Arguments
47    ///
48    /// * `from` - The absolute starting byte offset.
49    /// * `to` - The absolute ending byte offset (exclusive).
50    #[inline]
51    pub fn slice_in_range(&self, from: u32, to: u32) -> &'input str {
52        let bytes_slice = self.input.slice_in_range(from, to);
53
54        // Reuse the same safe UTF-8 conversion logic as the `token` method.
55        bytes_slice.utf8_chunks().next().map_or("", |chunk| chunk.valid())
56    }
57
58    #[inline]
59    pub fn advance(&mut self) -> Option<Result<TypeToken<'input>, SyntaxError>> {
60        if self.input.has_reached_eof() {
61            return None;
62        }
63
64        let start = self.input.current_position();
65        let whitespaces = self.input.consume_whitespaces();
66        if !whitespaces.is_empty() {
67            let end = self.input.current_position();
68
69            return self.token(TypeTokenKind::Whitespace, whitespaces, start, end);
70        }
71
72        let (kind, length) = match self.input.read(3) {
73            [b'*', ..] => (TypeTokenKind::Asterisk, 1),
74            [b'n' | b'N', b'o' | b'O', b'n' | b'N'] => {
75                if self.input.is_at(b"non-positive-int", true) {
76                    (TypeTokenKind::NonPositiveInt, 16)
77                } else if self.input.is_at(b"non-negative-int", true) {
78                    (TypeTokenKind::NonNegativeInt, 16)
79                } else if self.input.is_at(b"non-empty-literal-string", true) {
80                    (TypeTokenKind::NonEmptyUnspecifiedLiteralString, 26)
81                } else if self.input.is_at(b"non-empty-string", true) {
82                    (TypeTokenKind::NonEmptyString, 16)
83                } else if self.input.is_at(b"non-empty-array", true) {
84                    (TypeTokenKind::NonEmptyArray, 15)
85                } else if self.input.is_at(b"non-empty-list", true) {
86                    (TypeTokenKind::NonEmptyList, 14)
87                } else if self.input.is_at(b"non-falsy-string", true) {
88                    (TypeTokenKind::NonFalsyString, 16)
89                } else if self.input.is_at(b"non-empty-lowercase-string", true) {
90                    (TypeTokenKind::NonEmptyLowercaseString, 26)
91                } else {
92                    self.read_identifier()
93                }
94            }
95            [b'p' | b'P', b'u' | b'U', b'r' | b'R'] => {
96                if self.input.is_at(b"pure-closure", true) {
97                    (TypeTokenKind::PureClosure, 12)
98                } else if self.input.is_at(b"pure-callable", true) {
99                    (TypeTokenKind::PureCallable, 13)
100                } else {
101                    self.read_identifier()
102                }
103            }
104            [b'n' | b'N', b'e' | b'E', b'v' | b'V'] => {
105                if self.input.is_at(b"never-return", true) {
106                    (TypeTokenKind::NeverReturn, 12)
107                } else if self.input.is_at(b"never-returns", true) {
108                    (TypeTokenKind::NeverReturns, 13)
109                } else {
110                    self.read_identifier()
111                }
112            }
113            [b't' | b'T', b'r' | b'R', b'u' | b'U'] => {
114                if self.input.is_at(b"truthy-string", true) {
115                    (TypeTokenKind::TruthyString, 13)
116                } else {
117                    self.read_identifier()
118                }
119            }
120            [b't' | b'T', b'r' | b'R', b'a' | b'A'] => {
121                if self.input.is_at(b"trait-string", true) {
122                    (TypeTokenKind::TraitString, 12)
123                } else {
124                    self.read_identifier()
125                }
126            }
127            [b'a' | b'A', b's' | b'S', b's' | b'S'] => {
128                if self.input.is_at(b"associative-array", true) {
129                    (TypeTokenKind::AssociativeArray, 17)
130                } else {
131                    self.read_identifier()
132                }
133            }
134            [b'c' | b'C', b'l' | b'L', b'a' | b'A'] => {
135                if self.input.is_at(b"class-string", true) {
136                    (TypeTokenKind::ClassString, 12)
137                } else {
138                    self.read_identifier()
139                }
140            }
141            [b'e' | b'E', b'n' | b'N', b'u' | b'U'] => {
142                if self.input.is_at(b"enum-string", true) {
143                    (TypeTokenKind::EnumString, 11)
144                } else {
145                    self.read_identifier()
146                }
147            }
148            [b'i' | b'I', b'n' | b'N', b't' | b'T'] => {
149                if self.input.is_at(b"interface-string", true) {
150                    (TypeTokenKind::InterfaceString, 16)
151                } else {
152                    self.read_identifier()
153                }
154            }
155            [b'c' | b'C', b'l' | b'L', b'o' | b'O'] => {
156                if self.input.is_at(b"closed-resource", true) {
157                    (TypeTokenKind::ClosedResource, 15)
158                } else {
159                    self.read_identifier()
160                }
161            }
162            [b's' | b'S', b't' | b'T', b'r' | b'R'] => {
163                if self.input.is_at(b"stringable-object", true) {
164                    (TypeTokenKind::StringableObject, 17)
165                } else {
166                    self.read_identifier()
167                }
168            }
169            [b'n' | b'N', b'u' | b'U', b'm' | b'M'] => {
170                if self.input.is_at(b"numeric-string", true) {
171                    (TypeTokenKind::NumericString, 14)
172                } else {
173                    self.read_identifier()
174                }
175            }
176            [b'l' | b'L', b'i' | b'I', b't' | b'T'] => {
177                if self.input.is_at(b"literal-string", true) {
178                    (TypeTokenKind::UnspecifiedLiteralString, 14)
179                } else if self.input.is_at(b"literal-int", true) {
180                    (TypeTokenKind::UnspecifiedLiteralInt, 11)
181                } else {
182                    self.read_identifier()
183                }
184            }
185            [b'l' | b'L', b'o' | b'O', b'w' | b'W'] => {
186                if self.input.is_at(b"lowercase-string", true) {
187                    (TypeTokenKind::LowercaseString, 16)
188                } else {
189                    self.read_identifier()
190                }
191            }
192            [b'o' | b'O', b'p' | b'P', b'e' | b'E'] => {
193                if self.input.is_at(b"open-resource", true) {
194                    (TypeTokenKind::OpenResource, 13)
195                } else {
196                    self.read_identifier()
197                }
198            }
199            [b'a' | b'A', b'r' | b'R', b'r' | b'R'] => {
200                if self.input.is_at(b"array-key", true) {
201                    (TypeTokenKind::ArrayKey, 9)
202                } else {
203                    self.read_identifier()
204                }
205            }
206            [b'n' | b'N', b'o' | b'O', b'-'] => {
207                if self.input.is_at(b"no-return", true) {
208                    (TypeTokenKind::NoReturn, 9)
209                } else {
210                    self.read_identifier()
211                }
212            }
213            [b'v' | b'V', b'a' | b'A', b'l' | b'L'] => {
214                if self.input.is_at(b"value-of", true) {
215                    (TypeTokenKind::ValueOf, 8)
216                } else {
217                    self.read_identifier()
218                }
219            }
220            [b'k' | b'K', b'e' | b'E', b'y' | b'Y'] => {
221                if self.input.is_at(b"key-of", true) {
222                    (TypeTokenKind::KeyOf, 6)
223                } else {
224                    self.read_identifier()
225                }
226            }
227            [b'p' | b'P', b'r' | b'R', b'o' | b'O'] => {
228                if self.input.is_at(b"protected-properties-of", true) {
229                    (TypeTokenKind::ProtectedPropertiesOf, 23)
230                } else if self.input.is_at(b"properties-of", true) {
231                    (TypeTokenKind::PropertiesOf, 13)
232                } else {
233                    self.read_identifier()
234                }
235            }
236            [b'p' | b'P', b'u' | b'U', b'b' | b'B'] => {
237                if self.input.is_at(b"public-properties-of", true) {
238                    (TypeTokenKind::PublicPropertiesOf, 20)
239                } else {
240                    self.read_identifier()
241                }
242            }
243            [b'p' | b'P', b'r' | b'R', b'i' | b'I'] => {
244                if self.input.is_at(b"private-properties-of", true) {
245                    (TypeTokenKind::PrivatePropertiesOf, 21)
246                } else {
247                    self.read_identifier()
248                }
249            }
250            [b'p' | b'P', b'o' | b'O', b's' | b'S'] => {
251                if self.input.is_at(b"positive-int", true) {
252                    (TypeTokenKind::PositiveInt, 12)
253                } else {
254                    self.read_identifier()
255                }
256            }
257            [b'n' | b'N', b'e' | b'E', b'g' | b'G'] => {
258                if self.input.is_at(b"negative-int", true) {
259                    (TypeTokenKind::NegativeInt, 12)
260                } else {
261                    self.read_identifier()
262                }
263            }
264            [b'.', b'.', b'.'] => (TypeTokenKind::Ellipsis, 3),
265            [b':', b':', ..] => (TypeTokenKind::ColonColon, 2),
266            [b'/', b'/', ..] => self.read_single_line_comment(),
267            [b'.', start_of_number!(), ..] => self.read_decimal(),
268            [start_of_number!(), ..] => self.read_number(),
269            [quote @ b'\'' | quote @ b'"', ..] => self.read_literal_string(quote),
270            [b'\\', start_of_identifier!(), ..] => self.read_fully_qualified_identifier(),
271            [start_of_identifier!(), ..] => self.read_identifier(),
272            [b'$', start_of_identifier!(), ..] => {
273                let mut length = 2;
274                while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
275                    length += 1;
276                }
277
278                (TypeTokenKind::Variable, length)
279            }
280            [b':', ..] => (TypeTokenKind::Colon, 1),
281            [b'=', ..] => (TypeTokenKind::Equals, 1),
282            [b'?', ..] => (TypeTokenKind::Question, 1),
283            [b'&', ..] => (TypeTokenKind::Ampersand, 1),
284            [b'|', ..] => (TypeTokenKind::Pipe, 1),
285            [b'>', ..] => (TypeTokenKind::GreaterThan, 1),
286            [b'<', ..] => (TypeTokenKind::LessThan, 1),
287            [b'(', ..] => (TypeTokenKind::LeftParenthesis, 1),
288            [b')', ..] => (TypeTokenKind::RightParenthesis, 1),
289            [b'[', ..] => (TypeTokenKind::LeftBracket, 1),
290            [b']', ..] => (TypeTokenKind::RightBracket, 1),
291            [b'{', ..] => (TypeTokenKind::LeftBrace, 1),
292            [b'}', ..] => (TypeTokenKind::RightBrace, 1),
293            [b',', ..] => (TypeTokenKind::Comma, 1),
294            [b'+', ..] => (TypeTokenKind::Plus, 1),
295            [b'-', ..] => (TypeTokenKind::Minus, 1),
296            [unknown_byte, ..] => {
297                return Some(Err(SyntaxError::UnrecognizedToken(
298                    self.file_id(),
299                    *unknown_byte,
300                    self.input.current_position(),
301                )));
302            }
303            [] => {
304                unreachable!()
305            }
306        };
307
308        let buffer = self.input.consume(length);
309        let end = self.input.current_position();
310
311        self.token(kind, buffer, start, end)
312    }
313
314    fn read_single_line_comment(&self) -> (TypeTokenKind, usize) {
315        let mut length = 2;
316        loop {
317            match self.input.peek(length, 1) {
318                [b'\n', ..] | [] => {
319                    break;
320                }
321                [_, ..] => {
322                    length += 1;
323                }
324            }
325        }
326
327        (TypeTokenKind::SingleLineComment, length)
328    }
329
330    fn read_decimal(&self) -> (TypeTokenKind, usize) {
331        let mut length = read_digits_of_base(&self.input, 2, 10);
332        if let float_exponent!() = self.input.peek(length, 1) {
333            length += 1;
334            if let number_sign!() = self.input.peek(length, 1) {
335                length += 1;
336            }
337
338            length = read_digits_of_base(&self.input, length, 10);
339        }
340
341        (TypeTokenKind::LiteralFloat, length)
342    }
343
344    fn read_number(&self) -> (TypeTokenKind, usize) {
345        #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
346        pub enum NumberKind {
347            Integer,
348            Float,
349            OctalOrFloat,
350            IntegerOrFloat,
351        }
352
353        let mut length = 1;
354
355        let (base, kind): (u8, NumberKind) = match self.input.read(3) {
356            start_of_binary_number!() => {
357                length += 1;
358
359                (2, NumberKind::Integer)
360            }
361            start_of_octal_number!() => {
362                length += 1;
363
364                (8, NumberKind::Integer)
365            }
366            start_of_hexadecimal_number!() => {
367                length += 1;
368
369                (16, NumberKind::Integer)
370            }
371            start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
372            start_of_float_number!() => (10, NumberKind::Float),
373            _ => (10, NumberKind::IntegerOrFloat),
374        };
375
376        if kind != NumberKind::Float {
377            length = read_digits_of_base(&self.input, length, base);
378
379            if kind == NumberKind::Integer {
380                return (TypeTokenKind::LiteralInteger, length);
381            }
382        }
383
384        let is_float = matches!(self.input.peek(length, 3), float_separator!());
385
386        if !is_float {
387            return (TypeTokenKind::LiteralInteger, length);
388        }
389
390        if let [b'.'] = self.input.peek(length, 1) {
391            length += 1;
392            length = read_digits_of_base(&self.input, length, 10);
393        }
394
395        if let float_exponent!() = self.input.peek(length, 1) {
396            length += 1;
397            if let number_sign!() = self.input.peek(length, 1) {
398                length += 1;
399            }
400
401            length = read_digits_of_base(&self.input, length, 10);
402        }
403
404        (TypeTokenKind::LiteralFloat, length)
405    }
406
407    fn read_literal_string(&self, quote: &u8) -> (TypeTokenKind, usize) {
408        let total = self.input.len();
409        let start = self.input.current_offset();
410        let mut length = 1; // We assume the opening quote is already consumed.
411        let mut last_was_backslash = false;
412        let mut partial = false;
413
414        loop {
415            let pos = start + length;
416            if pos >= total {
417                // Reached EOF before closing quote.
418                partial = true;
419                break;
420            }
421
422            let byte = self.input.read_at(pos);
423            if matches!(byte, b'\\') {
424                // Toggle the backslash flag.
425                last_was_backslash = !last_was_backslash;
426                length += 1;
427            } else {
428                // If we see the closing quote and the previous byte was not an escape.
429                if byte == quote && !last_was_backslash {
430                    length += 1; // Include the closing quote.
431                    break;
432                }
433
434                length += 1;
435                last_was_backslash = false;
436            }
437        }
438
439        if partial { (TypeTokenKind::PartialLiteralString, length) } else { (TypeTokenKind::LiteralString, length) }
440    }
441
442    fn read_fully_qualified_identifier(&self) -> (TypeTokenKind, usize) {
443        let mut length = 2;
444        let mut last_was_slash = false;
445        loop {
446            match self.input.peek(length, 1) {
447                [start_of_identifier!(), ..] if last_was_slash => {
448                    length += 1;
449                    last_was_slash = false;
450                }
451                [part_of_identifier!(), ..] if !last_was_slash => {
452                    length += 1;
453                }
454                [b'\\', ..] => {
455                    if last_was_slash {
456                        length -= 1;
457
458                        break;
459                    }
460
461                    length += 1;
462                    last_was_slash = true;
463                }
464                _ => {
465                    break;
466                }
467            }
468        }
469
470        (TypeTokenKind::FullyQualifiedIdentifier, length)
471    }
472
473    fn read_identifier(&self) -> (TypeTokenKind, usize) {
474        const KEYWORD_TYPES: [(&[u8], TypeTokenKind); 28] = [
475            (b"list", TypeTokenKind::List),
476            (b"int", TypeTokenKind::Int),
477            (b"integer", TypeTokenKind::Integer),
478            (b"string", TypeTokenKind::String),
479            (b"float", TypeTokenKind::Float),
480            (b"double", TypeTokenKind::Double),
481            (b"real", TypeTokenKind::Real),
482            (b"bool", TypeTokenKind::Bool),
483            (b"boolean", TypeTokenKind::Boolean),
484            (b"false", TypeTokenKind::False),
485            (b"true", TypeTokenKind::True),
486            (b"object", TypeTokenKind::Object),
487            (b"callable", TypeTokenKind::Callable),
488            (b"array", TypeTokenKind::Array),
489            (b"iterable", TypeTokenKind::Iterable),
490            (b"null", TypeTokenKind::Null),
491            (b"mixed", TypeTokenKind::Mixed),
492            (b"resource", TypeTokenKind::Resource),
493            (b"void", TypeTokenKind::Void),
494            (b"scalar", TypeTokenKind::Scalar),
495            (b"numeric", TypeTokenKind::Numeric),
496            (b"never", TypeTokenKind::Never),
497            (b"nothing", TypeTokenKind::Nothing),
498            (b"as", TypeTokenKind::As),
499            (b"is", TypeTokenKind::Is),
500            (b"not", TypeTokenKind::Not),
501            (b"min", TypeTokenKind::Min),
502            (b"max", TypeTokenKind::Max),
503        ];
504
505        let mut length = 1;
506        let mut ended_with_slash = false;
507        loop {
508            match self.input.peek(length, 2) {
509                [part_of_identifier!(), ..] => {
510                    length += 1;
511                }
512                [b'\\', start_of_identifier!(), ..] => {
513                    ended_with_slash = true;
514                    break;
515                }
516                _ => {
517                    break;
518                }
519            }
520        }
521
522        if !ended_with_slash {
523            for (value, kind) in KEYWORD_TYPES {
524                let keyword_length = value.len();
525                if keyword_length != length {
526                    continue;
527                }
528
529                if self.input.is_at(value, true) {
530                    return (kind, keyword_length);
531                }
532            }
533        }
534
535        let mut slashes = 0;
536        let mut last_was_slash = false;
537        loop {
538            match self.input.peek(length, 1) {
539                [start_of_identifier!(), ..] if last_was_slash => {
540                    length += 1;
541                    last_was_slash = false;
542                }
543                [part_of_identifier!(), ..] if !last_was_slash => {
544                    length += 1;
545                }
546                [b'\\', ..] => {
547                    if !last_was_slash {
548                        length += 1;
549                        slashes += 1;
550                        last_was_slash = true;
551                    } else {
552                        length -= 1;
553                        slashes -= 1;
554                        last_was_slash = false;
555
556                        break;
557                    }
558                }
559                _ => {
560                    break;
561                }
562            }
563        }
564
565        if last_was_slash {
566            length -= 1;
567            slashes -= 1;
568        }
569
570        if slashes > 0 { (TypeTokenKind::QualifiedIdentifier, length) } else { (TypeTokenKind::Identifier, length) }
571    }
572
573    #[inline]
574    fn token(
575        &self,
576        kind: TypeTokenKind,
577        value: &'input [u8],
578        from: Position,
579        to: Position,
580    ) -> Option<Result<TypeToken<'input>, SyntaxError>> {
581        let mut value_chunks = value.utf8_chunks();
582        let value_str = if let Some(chunk) = value_chunks.next() {
583            let valid = chunk.valid();
584
585            debug_assert_eq!(valid.len(), value.len());
586
587            valid
588        } else {
589            ""
590        };
591
592        Some(Ok(TypeToken { kind, value: value_str, span: Span::new(self.file_id(), from, to) }))
593    }
594}
595
/// Exposes the file identifier of the input the lexer reads from.
impl HasFileId for TypeLexer<'_> {
    /// Returns the `FileId` reported by the underlying input.
    fn file_id(&self) -> FileId {
        self.input.file_id()
    }
}