1use mago_database::file::FileId;
2use mago_database::file::HasFileId;
3use mago_span::Position;
4use mago_span::Span;
5use mago_syntax_core::float_exponent;
6use mago_syntax_core::float_separator;
7use mago_syntax_core::input::Input;
8use mago_syntax_core::number_sign;
9use mago_syntax_core::part_of_identifier;
10use mago_syntax_core::start_of_binary_number;
11use mago_syntax_core::start_of_float_number;
12use mago_syntax_core::start_of_hexadecimal_number;
13use mago_syntax_core::start_of_identifier;
14use mago_syntax_core::start_of_number;
15use mago_syntax_core::start_of_octal_number;
16use mago_syntax_core::start_of_octal_or_float_number;
17use mago_syntax_core::utils::read_digits_of_base;
18
19use crate::error::SyntaxError;
20use crate::token::TypeToken;
21use crate::token::TypeTokenKind;
22
/// Lexer that splits an [`Input`] into [`TypeToken`]s.
///
/// Tokens are produced one at a time by [`TypeLexer::advance`].
#[derive(Debug)]
pub struct TypeLexer<'input> {
    /// The byte input the lexer reads from and consumes.
    input: Input<'input>,
}
27
28impl<'input> TypeLexer<'input> {
29 #[must_use]
30 pub fn new(input: Input<'input>) -> TypeLexer<'input> {
31 TypeLexer { input }
32 }
33
34 #[must_use]
35 pub fn has_reached_eof(&self) -> bool {
36 self.input.has_reached_eof()
37 }
38
39 #[must_use]
40 pub fn current_position(&self) -> Position {
41 self.input.current_position()
42 }
43
44 #[inline]
54 #[must_use]
55 pub fn slice_in_range(&self, from: u32, to: u32) -> &'input str {
56 let bytes_slice = self.input.slice_in_range(from, to);
57
58 bytes_slice.utf8_chunks().next().map_or("", |chunk| chunk.valid())
60 }
61
62 #[inline]
63 pub fn advance(&mut self) -> Option<Result<TypeToken<'input>, SyntaxError>> {
64 if self.input.has_reached_eof() {
65 return None;
66 }
67
68 let start = self.input.current_position();
69 let whitespaces = self.input.consume_whitespaces();
70 if !whitespaces.is_empty() {
71 let end = self.input.current_position();
72
73 return Some(Ok(self.token(TypeTokenKind::Whitespace, whitespaces, start, end)));
74 }
75
76 let (kind, length) = match self.input.read(3) {
77 [b'*', ..] => (TypeTokenKind::Asterisk, 1),
78 [b'n' | b'N', b'o' | b'O', b'n' | b'N'] => {
79 if self.input.is_at(b"non-positive-int", true) {
80 (TypeTokenKind::NonPositiveInt, 16)
81 } else if self.input.is_at(b"non-negative-int", true) {
82 (TypeTokenKind::NonNegativeInt, 16)
83 } else if self.input.is_at(b"non-empty-literal-string", true) {
84 (TypeTokenKind::NonEmptyUnspecifiedLiteralString, 26)
85 } else if self.input.is_at(b"non-empty-string", true) {
86 (TypeTokenKind::NonEmptyString, 16)
87 } else if self.input.is_at(b"non-empty-array", true) {
88 (TypeTokenKind::NonEmptyArray, 15)
89 } else if self.input.is_at(b"non-empty-list", true) {
90 (TypeTokenKind::NonEmptyList, 14)
91 } else if self.input.is_at(b"non-falsy-string", true) {
92 (TypeTokenKind::NonFalsyString, 16)
93 } else if self.input.is_at(b"non-empty-lowercase-string", true) {
94 (TypeTokenKind::NonEmptyLowercaseString, 26)
95 } else {
96 self.read_identifier()
97 }
98 }
99 [b'p' | b'P', b'u' | b'U', b'r' | b'R'] => {
100 if self.input.is_at(b"pure-closure", true) {
101 (TypeTokenKind::PureClosure, 12)
102 } else if self.input.is_at(b"pure-callable", true) {
103 (TypeTokenKind::PureCallable, 13)
104 } else {
105 self.read_identifier()
106 }
107 }
108 [b'n' | b'N', b'e' | b'E', b'v' | b'V'] => {
109 if self.input.is_at(b"never-return", true) {
110 (TypeTokenKind::NeverReturn, 12)
111 } else if self.input.is_at(b"never-returns", true) {
112 (TypeTokenKind::NeverReturns, 13)
113 } else {
114 self.read_identifier()
115 }
116 }
117 [b't' | b'T', b'r' | b'R', b'u' | b'U'] => {
118 if self.input.is_at(b"truthy-string", true) {
119 (TypeTokenKind::TruthyString, 13)
120 } else {
121 self.read_identifier()
122 }
123 }
124 [b't' | b'T', b'r' | b'R', b'a' | b'A'] => {
125 if self.input.is_at(b"trait-string", true) {
126 (TypeTokenKind::TraitString, 12)
127 } else {
128 self.read_identifier()
129 }
130 }
131 [b'a' | b'A', b's' | b'S', b's' | b'S'] => {
132 if self.input.is_at(b"associative-array", true) {
133 (TypeTokenKind::AssociativeArray, 17)
134 } else {
135 self.read_identifier()
136 }
137 }
138 [b'c' | b'C', b'l' | b'L', b'a' | b'A'] => {
139 if self.input.is_at(b"class-string", true) {
140 (TypeTokenKind::ClassString, 12)
141 } else {
142 self.read_identifier()
143 }
144 }
145 [b'e' | b'E', b'n' | b'N', b'u' | b'U'] => {
146 if self.input.is_at(b"enum-string", true) {
147 (TypeTokenKind::EnumString, 11)
148 } else {
149 self.read_identifier()
150 }
151 }
152 [b'i' | b'I', b'n' | b'N', b't' | b'T'] => {
153 if self.input.is_at(b"interface-string", true) {
154 (TypeTokenKind::InterfaceString, 16)
155 } else if self.input.is_at(b"int-mask-of", true) {
156 (TypeTokenKind::IntMaskOf, 11)
157 } else if self.input.is_at(b"int-mask", true) {
158 (TypeTokenKind::IntMask, 8)
159 } else {
160 self.read_identifier()
161 }
162 }
163 [b'c' | b'C', b'l' | b'L', b'o' | b'O'] => {
164 if self.input.is_at(b"closed-resource", true) {
165 (TypeTokenKind::ClosedResource, 15)
166 } else {
167 self.read_identifier()
168 }
169 }
170 [b's' | b'S', b't' | b'T', b'r' | b'R'] => {
171 if self.input.is_at(b"stringable-object", true) {
172 (TypeTokenKind::StringableObject, 17)
173 } else {
174 self.read_identifier()
175 }
176 }
177 [b'n' | b'N', b'u' | b'U', b'm' | b'M'] => {
178 if self.input.is_at(b"numeric-string", true) {
179 (TypeTokenKind::NumericString, 14)
180 } else {
181 self.read_identifier()
182 }
183 }
184 [b'l' | b'L', b'i' | b'I', b't' | b'T'] => {
185 if self.input.is_at(b"literal-string", true) {
186 (TypeTokenKind::UnspecifiedLiteralString, 14)
187 } else if self.input.is_at(b"literal-int", true) {
188 (TypeTokenKind::UnspecifiedLiteralInt, 11)
189 } else if self.input.is_at(b"literal-float", true) {
190 (TypeTokenKind::UnspecifiedLiteralFloat, 13)
191 } else {
192 self.read_identifier()
193 }
194 }
195 [b'l' | b'L', b'o' | b'O', b'w' | b'W'] => {
196 if self.input.is_at(b"lowercase-string", true) {
197 (TypeTokenKind::LowercaseString, 16)
198 } else {
199 self.read_identifier()
200 }
201 }
202 [b'o' | b'O', b'p' | b'P', b'e' | b'E'] => {
203 if self.input.is_at(b"open-resource", true) {
204 (TypeTokenKind::OpenResource, 13)
205 } else {
206 self.read_identifier()
207 }
208 }
209 [b'a' | b'A', b'r' | b'R', b'r' | b'R'] => {
210 if self.input.is_at(b"array-key", true) {
211 (TypeTokenKind::ArrayKey, 9)
212 } else {
213 self.read_identifier()
214 }
215 }
216 [b'n' | b'N', b'o' | b'O', b'-'] => {
217 if self.input.is_at(b"no-return", true) {
218 (TypeTokenKind::NoReturn, 9)
219 } else {
220 self.read_identifier()
221 }
222 }
223 [b'v' | b'V', b'a' | b'A', b'l' | b'L'] => {
224 if self.input.is_at(b"value-of", true) {
225 (TypeTokenKind::ValueOf, 8)
226 } else {
227 self.read_identifier()
228 }
229 }
230 [b'k' | b'K', b'e' | b'E', b'y' | b'Y'] => {
231 if self.input.is_at(b"key-of", true) {
232 (TypeTokenKind::KeyOf, 6)
233 } else {
234 self.read_identifier()
235 }
236 }
237 [b'p' | b'P', b'r' | b'R', b'o' | b'O'] => {
238 if self.input.is_at(b"protected-properties-of", true) {
239 (TypeTokenKind::ProtectedPropertiesOf, 23)
240 } else if self.input.is_at(b"properties-of", true) {
241 (TypeTokenKind::PropertiesOf, 13)
242 } else {
243 self.read_identifier()
244 }
245 }
246 [b'p' | b'P', b'u' | b'U', b'b' | b'B'] => {
247 if self.input.is_at(b"public-properties-of", true) {
248 (TypeTokenKind::PublicPropertiesOf, 20)
249 } else {
250 self.read_identifier()
251 }
252 }
253 [b'p' | b'P', b'r' | b'R', b'i' | b'I'] => {
254 if self.input.is_at(b"private-properties-of", true) {
255 (TypeTokenKind::PrivatePropertiesOf, 21)
256 } else {
257 self.read_identifier()
258 }
259 }
260 [b'p' | b'P', b'o' | b'O', b's' | b'S'] => {
261 if self.input.is_at(b"positive-int", true) {
262 (TypeTokenKind::PositiveInt, 12)
263 } else {
264 self.read_identifier()
265 }
266 }
267 [b'n' | b'N', b'e' | b'E', b'g' | b'G'] => {
268 if self.input.is_at(b"negative-int", true) {
269 (TypeTokenKind::NegativeInt, 12)
270 } else {
271 self.read_identifier()
272 }
273 }
274 [b'.', b'.', b'.'] => (TypeTokenKind::Ellipsis, 3),
275 [b':', b':', ..] => (TypeTokenKind::ColonColon, 2),
276 [b'/', b'/', ..] => self.read_single_line_comment(),
277 [b'.', start_of_number!(), ..] => self.read_decimal(),
278 [start_of_number!(), ..] => self.read_number(),
279 [quote @ (b'\'' | b'"'), ..] => self.read_literal_string(*quote),
280 [b'\\', start_of_identifier!(), ..] => self.read_fully_qualified_identifier(),
281 [start_of_identifier!(), ..] => self.read_identifier(),
282 [b'$', start_of_identifier!(), ..] => {
283 let mut length = 2;
284 while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
285 length += 1;
286 }
287
288 (TypeTokenKind::Variable, length)
289 }
290 [b':', ..] => (TypeTokenKind::Colon, 1),
291 [b'=', ..] => (TypeTokenKind::Equals, 1),
292 [b'?', ..] => (TypeTokenKind::Question, 1),
293 [b'!', ..] => (TypeTokenKind::Exclamation, 1),
294 [b'&', ..] => (TypeTokenKind::Ampersand, 1),
295 [b'|', ..] => (TypeTokenKind::Pipe, 1),
296 [b'>', ..] => (TypeTokenKind::GreaterThan, 1),
297 [b'<', ..] => (TypeTokenKind::LessThan, 1),
298 [b'(', ..] => (TypeTokenKind::LeftParenthesis, 1),
299 [b')', ..] => (TypeTokenKind::RightParenthesis, 1),
300 [b'[', ..] => (TypeTokenKind::LeftBracket, 1),
301 [b']', ..] => (TypeTokenKind::RightBracket, 1),
302 [b'{', ..] => (TypeTokenKind::LeftBrace, 1),
303 [b'}', ..] => (TypeTokenKind::RightBrace, 1),
304 [b',', ..] => (TypeTokenKind::Comma, 1),
305 [b'+', ..] => (TypeTokenKind::Plus, 1),
306 [b'-', ..] => (TypeTokenKind::Minus, 1),
307 [unknown_byte, ..] => {
308 return Some(Err(SyntaxError::UnrecognizedToken(
309 self.file_id(),
310 *unknown_byte,
311 self.input.current_position(),
312 )));
313 }
314 [] => {
315 unreachable!()
316 }
317 };
318
319 let buffer = self.input.consume(length);
320 let end = self.input.current_position();
321
322 Some(Ok(self.token(kind, buffer, start, end)))
323 }
324
325 fn read_single_line_comment(&self) -> (TypeTokenKind, usize) {
326 let mut length = 2;
327 loop {
328 match self.input.peek(length, 1) {
329 [b'\n', ..] | [] => {
330 break;
331 }
332 [_, ..] => {
333 length += 1;
334 }
335 }
336 }
337
338 (TypeTokenKind::SingleLineComment, length)
339 }
340
341 fn read_decimal(&self) -> (TypeTokenKind, usize) {
342 let mut length = read_digits_of_base(&self.input, 2, 10);
343 if let float_exponent!() = self.input.peek(length, 1) {
344 length += 1;
345 if let number_sign!() = self.input.peek(length, 1) {
346 length += 1;
347 }
348
349 length = read_digits_of_base(&self.input, length, 10);
350 }
351
352 (TypeTokenKind::LiteralFloat, length)
353 }
354
355 fn read_number(&self) -> (TypeTokenKind, usize) {
356 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
357 pub enum NumberKind {
358 Integer,
359 Float,
360 OctalOrFloat,
361 IntegerOrFloat,
362 }
363
364 let mut length = 1;
365
366 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
367 start_of_binary_number!() => {
368 length += 1;
369
370 (2, NumberKind::Integer)
371 }
372 start_of_octal_number!() => {
373 length += 1;
374
375 (8, NumberKind::Integer)
376 }
377 start_of_hexadecimal_number!() => {
378 length += 1;
379
380 (16, NumberKind::Integer)
381 }
382 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
383 start_of_float_number!() => (10, NumberKind::Float),
384 _ => (10, NumberKind::IntegerOrFloat),
385 };
386
387 if kind != NumberKind::Float {
388 length = read_digits_of_base(&self.input, length, base);
389
390 if kind == NumberKind::Integer {
391 return (TypeTokenKind::LiteralInteger, length);
392 }
393 }
394
395 let is_float = matches!(self.input.peek(length, 3), float_separator!());
396
397 if !is_float {
398 return (TypeTokenKind::LiteralInteger, length);
399 }
400
401 if let [b'.'] = self.input.peek(length, 1) {
402 length += 1;
403 length = read_digits_of_base(&self.input, length, 10);
404 }
405
406 if let float_exponent!() = self.input.peek(length, 1) {
407 length += 1;
408 if let number_sign!() = self.input.peek(length, 1) {
409 length += 1;
410 }
411
412 length = read_digits_of_base(&self.input, length, 10);
413 }
414
415 (TypeTokenKind::LiteralFloat, length)
416 }
417
418 fn read_literal_string(&self, quote: u8) -> (TypeTokenKind, usize) {
419 let total = self.input.len();
420 let start = self.input.current_offset();
421 let mut length = 1; let mut last_was_backslash = false;
423 let mut partial = false;
424
425 loop {
426 let pos = start + length;
427 if pos >= total {
428 partial = true;
430 break;
431 }
432
433 let byte = self.input.read_at(pos);
434 if matches!(byte, b'\\') {
435 last_was_backslash = !last_was_backslash;
437 length += 1;
438 } else {
439 if byte == "e && !last_was_backslash {
441 length += 1; break;
443 }
444
445 length += 1;
446 last_was_backslash = false;
447 }
448 }
449
450 if partial { (TypeTokenKind::PartialLiteralString, length) } else { (TypeTokenKind::LiteralString, length) }
451 }
452
453 fn read_fully_qualified_identifier(&self) -> (TypeTokenKind, usize) {
454 let mut length = 2;
455 let mut last_was_slash = false;
456 loop {
457 match self.input.peek(length, 1) {
458 [start_of_identifier!(), ..] if last_was_slash => {
459 length += 1;
460 last_was_slash = false;
461 }
462 [part_of_identifier!(), ..] if !last_was_slash => {
463 length += 1;
464 }
465 [b'\\', ..] => {
466 if last_was_slash {
467 length -= 1;
468
469 break;
470 }
471
472 length += 1;
473 last_was_slash = true;
474 }
475 _ => {
476 break;
477 }
478 }
479 }
480
481 (TypeTokenKind::FullyQualifiedIdentifier, length)
482 }
483
484 fn read_identifier(&self) -> (TypeTokenKind, usize) {
485 const KEYWORD_TYPES: [(&[u8], TypeTokenKind); 28] = [
486 (b"list", TypeTokenKind::List),
487 (b"int", TypeTokenKind::Int),
488 (b"integer", TypeTokenKind::Integer),
489 (b"string", TypeTokenKind::String),
490 (b"float", TypeTokenKind::Float),
491 (b"double", TypeTokenKind::Double),
492 (b"real", TypeTokenKind::Real),
493 (b"bool", TypeTokenKind::Bool),
494 (b"boolean", TypeTokenKind::Boolean),
495 (b"false", TypeTokenKind::False),
496 (b"true", TypeTokenKind::True),
497 (b"object", TypeTokenKind::Object),
498 (b"callable", TypeTokenKind::Callable),
499 (b"array", TypeTokenKind::Array),
500 (b"iterable", TypeTokenKind::Iterable),
501 (b"null", TypeTokenKind::Null),
502 (b"mixed", TypeTokenKind::Mixed),
503 (b"resource", TypeTokenKind::Resource),
504 (b"void", TypeTokenKind::Void),
505 (b"scalar", TypeTokenKind::Scalar),
506 (b"numeric", TypeTokenKind::Numeric),
507 (b"never", TypeTokenKind::Never),
508 (b"nothing", TypeTokenKind::Nothing),
509 (b"as", TypeTokenKind::As),
510 (b"is", TypeTokenKind::Is),
511 (b"not", TypeTokenKind::Not),
512 (b"min", TypeTokenKind::Min),
513 (b"max", TypeTokenKind::Max),
514 ];
515
516 let mut length = 1;
517 let mut ended_with_slash = false;
518 loop {
519 match self.input.peek(length, 2) {
520 [part_of_identifier!(), ..] => {
521 length += 1;
522 }
523 [b'\\', start_of_identifier!(), ..] => {
524 ended_with_slash = true;
525 break;
526 }
527 _ => {
528 break;
529 }
530 }
531 }
532
533 if !ended_with_slash {
534 for (value, kind) in KEYWORD_TYPES {
535 let keyword_length = value.len();
536 if keyword_length != length {
537 continue;
538 }
539
540 if self.input.is_at(value, true) {
541 return (kind, keyword_length);
542 }
543 }
544 }
545
546 let mut slashes = 0;
547 let mut last_was_slash = false;
548 loop {
549 match self.input.peek(length, 1) {
550 [start_of_identifier!(), ..] if last_was_slash => {
551 length += 1;
552 last_was_slash = false;
553 }
554 [part_of_identifier!(), ..] if !last_was_slash => {
555 length += 1;
556 }
557 [b'\\', ..] => {
558 if last_was_slash {
559 length -= 1;
560 slashes -= 1;
561 last_was_slash = false;
562
563 break;
564 }
565 length += 1;
566 slashes += 1;
567 last_was_slash = true;
568 }
569 _ => {
570 break;
571 }
572 }
573 }
574
575 if last_was_slash {
576 length -= 1;
577 slashes -= 1;
578 }
579
580 if slashes > 0 { (TypeTokenKind::QualifiedIdentifier, length) } else { (TypeTokenKind::Identifier, length) }
581 }
582
583 #[inline]
584 fn token(&self, kind: TypeTokenKind, value: &'input [u8], from: Position, to: Position) -> TypeToken<'input> {
585 let mut value_chunks = value.utf8_chunks();
586 let value_str = if let Some(chunk) = value_chunks.next() {
587 let valid = chunk.valid();
588
589 debug_assert_eq!(valid.len(), value.len());
590
591 valid
592 } else {
593 ""
594 };
595
596 TypeToken { kind, value: value_str, span: Span::new(self.file_id(), from, to) }
597 }
598}
599
600impl HasFileId for TypeLexer<'_> {
601 fn file_id(&self) -> FileId {
602 self.input.file_id()
603 }
604}