1use mago_database::file::FileId;
2use mago_database::file::HasFileId;
3use mago_span::Position;
4use mago_span::Span;
5use mago_syntax_core::float_exponent;
6use mago_syntax_core::float_separator;
7use mago_syntax_core::input::Input;
8use mago_syntax_core::number_sign;
9use mago_syntax_core::part_of_identifier;
10use mago_syntax_core::start_of_binary_number;
11use mago_syntax_core::start_of_float_number;
12use mago_syntax_core::start_of_hexadecimal_number;
13use mago_syntax_core::start_of_identifier;
14use mago_syntax_core::start_of_number;
15use mago_syntax_core::start_of_octal_number;
16use mago_syntax_core::start_of_octal_or_float_number;
17use mago_syntax_core::utils::read_digits_of_base;
18
19use crate::error::SyntaxError;
20use crate::token::TypeToken;
21use crate::token::TypeTokenKind;
22
23#[derive(Debug)]
24pub struct TypeLexer<'input> {
25 input: Input<'input>,
26}
27
28impl<'input> TypeLexer<'input> {
29 #[must_use]
30 pub fn new(input: Input<'input>) -> TypeLexer<'input> {
31 TypeLexer { input }
32 }
33
34 #[must_use]
35 pub fn has_reached_eof(&self) -> bool {
36 self.input.has_reached_eof()
37 }
38
39 #[must_use]
40 pub fn current_position(&self) -> Position {
41 self.input.current_position()
42 }
43
44 #[inline]
54 #[must_use]
55 pub fn slice_in_range(&self, from: u32, to: u32) -> &'input str {
56 let bytes_slice = self.input.slice_in_range(from, to);
57
58 bytes_slice.utf8_chunks().next().map_or("", |chunk| chunk.valid())
60 }
61
62 #[inline]
63 pub fn advance(&mut self) -> Option<Result<TypeToken<'input>, SyntaxError>> {
64 if self.input.has_reached_eof() {
65 return None;
66 }
67
68 let start = self.input.current_position();
69 let whitespaces = self.input.consume_whitespaces();
70 if !whitespaces.is_empty() {
71 let end = self.input.current_position();
72
73 return Some(Ok(self.token(TypeTokenKind::Whitespace, whitespaces, start, end)));
74 }
75
76 let (kind, length) = match self.input.read(3) {
77 [b'*', ..] => (TypeTokenKind::Asterisk, 1),
78 [b'n' | b'N', b'o' | b'O', b'n' | b'N'] => {
79 if self.input.is_at(b"non-positive-int", true) {
80 (TypeTokenKind::NonPositiveInt, 16)
81 } else if self.input.is_at(b"non-negative-int", true) {
82 (TypeTokenKind::NonNegativeInt, 16)
83 } else if self.input.is_at(b"non-empty-literal-string", true) {
84 (TypeTokenKind::NonEmptyUnspecifiedLiteralString, 26)
85 } else if self.input.is_at(b"non-empty-string", true) {
86 (TypeTokenKind::NonEmptyString, 16)
87 } else if self.input.is_at(b"non-empty-array", true) {
88 (TypeTokenKind::NonEmptyArray, 15)
89 } else if self.input.is_at(b"non-empty-list", true) {
90 (TypeTokenKind::NonEmptyList, 14)
91 } else if self.input.is_at(b"non-falsy-string", true) {
92 (TypeTokenKind::NonFalsyString, 16)
93 } else if self.input.is_at(b"non-empty-lowercase-string", true) {
94 (TypeTokenKind::NonEmptyLowercaseString, 26)
95 } else if self.input.is_at(b"non-empty-mixed", true) {
96 (TypeTokenKind::NonEmptyMixed, 15)
97 } else {
98 self.read_identifier()
99 }
100 }
101 [b'p' | b'P', b'u' | b'U', b'r' | b'R'] => {
102 if self.input.is_at(b"pure-closure", true) {
103 (TypeTokenKind::PureClosure, 12)
104 } else if self.input.is_at(b"pure-callable", true) {
105 (TypeTokenKind::PureCallable, 13)
106 } else {
107 self.read_identifier()
108 }
109 }
110 [b'n' | b'N', b'e' | b'E', b'v' | b'V'] => {
111 if self.input.is_at(b"never-return", true) {
112 (TypeTokenKind::NeverReturn, 12)
113 } else if self.input.is_at(b"never-returns", true) {
114 (TypeTokenKind::NeverReturns, 13)
115 } else {
116 self.read_identifier()
117 }
118 }
119 [b't' | b'T', b'r' | b'R', b'u' | b'U'] => {
120 if self.input.is_at(b"truthy-string", true) {
121 (TypeTokenKind::TruthyString, 13)
122 } else {
123 self.read_identifier()
124 }
125 }
126 [b't' | b'T', b'r' | b'R', b'a' | b'A'] => {
127 if self.input.is_at(b"trait-string", true) {
128 (TypeTokenKind::TraitString, 12)
129 } else {
130 self.read_identifier()
131 }
132 }
133 [b'a' | b'A', b's' | b'S', b's' | b'S'] => {
134 if self.input.is_at(b"associative-array", true) {
135 (TypeTokenKind::AssociativeArray, 17)
136 } else {
137 self.read_identifier()
138 }
139 }
140 [b'c' | b'C', b'l' | b'L', b'a' | b'A'] => {
141 if self.input.is_at(b"class-string", true) {
142 (TypeTokenKind::ClassString, 12)
143 } else {
144 self.read_identifier()
145 }
146 }
147 [b'e' | b'E', b'n' | b'N', b'u' | b'U'] => {
148 if self.input.is_at(b"enum-string", true) {
149 (TypeTokenKind::EnumString, 11)
150 } else {
151 self.read_identifier()
152 }
153 }
154 [b'i' | b'I', b'n' | b'N', b't' | b'T'] => {
155 if self.input.is_at(b"interface-string", true) {
156 (TypeTokenKind::InterfaceString, 16)
157 } else if self.input.is_at(b"int-mask-of", true) {
158 (TypeTokenKind::IntMaskOf, 11)
159 } else if self.input.is_at(b"int-mask", true) {
160 (TypeTokenKind::IntMask, 8)
161 } else {
162 self.read_identifier()
163 }
164 }
165 [b'c' | b'C', b'l' | b'L', b'o' | b'O'] => {
166 if self.input.is_at(b"closed-resource", true) {
167 (TypeTokenKind::ClosedResource, 15)
168 } else {
169 self.read_identifier()
170 }
171 }
172 [b's' | b'S', b't' | b'T', b'r' | b'R'] => {
173 if self.input.is_at(b"stringable-object", true) {
174 (TypeTokenKind::StringableObject, 17)
175 } else {
176 self.read_identifier()
177 }
178 }
179 [b'n' | b'N', b'u' | b'U', b'm' | b'M'] => {
180 if self.input.is_at(b"numeric-string", true) {
181 (TypeTokenKind::NumericString, 14)
182 } else {
183 self.read_identifier()
184 }
185 }
186 [b'l' | b'L', b'i' | b'I', b't' | b'T'] => {
187 if self.input.is_at(b"literal-string", true) {
188 (TypeTokenKind::UnspecifiedLiteralString, 14)
189 } else if self.input.is_at(b"literal-int", true) {
190 (TypeTokenKind::UnspecifiedLiteralInt, 11)
191 } else if self.input.is_at(b"literal-float", true) {
192 (TypeTokenKind::UnspecifiedLiteralFloat, 13)
193 } else {
194 self.read_identifier()
195 }
196 }
197 [b'l' | b'L', b'o' | b'O', b'w' | b'W'] => {
198 if self.input.is_at(b"lowercase-string", true) {
199 (TypeTokenKind::LowercaseString, 16)
200 } else {
201 self.read_identifier()
202 }
203 }
204 [b'o' | b'O', b'p' | b'P', b'e' | b'E'] => {
205 if self.input.is_at(b"open-resource", true) {
206 (TypeTokenKind::OpenResource, 13)
207 } else {
208 self.read_identifier()
209 }
210 }
211 [b'a' | b'A', b'r' | b'R', b'r' | b'R'] => {
212 if self.input.is_at(b"array-key", true) {
213 (TypeTokenKind::ArrayKey, 9)
214 } else {
215 self.read_identifier()
216 }
217 }
218 [b'n' | b'N', b'o' | b'O', b'-'] => {
219 if self.input.is_at(b"no-return", true) {
220 (TypeTokenKind::NoReturn, 9)
221 } else {
222 self.read_identifier()
223 }
224 }
225 [b'v' | b'V', b'a' | b'A', b'l' | b'L'] => {
226 if self.input.is_at(b"value-of", true) {
227 (TypeTokenKind::ValueOf, 8)
228 } else {
229 self.read_identifier()
230 }
231 }
232 [b'k' | b'K', b'e' | b'E', b'y' | b'Y'] => {
233 if self.input.is_at(b"key-of", true) {
234 (TypeTokenKind::KeyOf, 6)
235 } else {
236 self.read_identifier()
237 }
238 }
239 [b'p' | b'P', b'r' | b'R', b'o' | b'O'] => {
240 if self.input.is_at(b"protected-properties-of", true) {
241 (TypeTokenKind::ProtectedPropertiesOf, 23)
242 } else if self.input.is_at(b"properties-of", true) {
243 (TypeTokenKind::PropertiesOf, 13)
244 } else {
245 self.read_identifier()
246 }
247 }
248 [b'p' | b'P', b'u' | b'U', b'b' | b'B'] => {
249 if self.input.is_at(b"public-properties-of", true) {
250 (TypeTokenKind::PublicPropertiesOf, 20)
251 } else {
252 self.read_identifier()
253 }
254 }
255 [b'p' | b'P', b'r' | b'R', b'i' | b'I'] => {
256 if self.input.is_at(b"private-properties-of", true) {
257 (TypeTokenKind::PrivatePropertiesOf, 21)
258 } else {
259 self.read_identifier()
260 }
261 }
262 [b'p' | b'P', b'o' | b'O', b's' | b'S'] => {
263 if self.input.is_at(b"positive-int", true) {
264 (TypeTokenKind::PositiveInt, 12)
265 } else {
266 self.read_identifier()
267 }
268 }
269 [b'n' | b'N', b'e' | b'E', b'g' | b'G'] => {
270 if self.input.is_at(b"negative-int", true) {
271 (TypeTokenKind::NegativeInt, 12)
272 } else {
273 self.read_identifier()
274 }
275 }
276 [b'.', b'.', b'.'] => (TypeTokenKind::Ellipsis, 3),
277 [b':', b':', ..] => (TypeTokenKind::ColonColon, 2),
278 [b'/', b'/', ..] => self.read_single_line_comment(),
279 [b'.', start_of_number!(), ..] => self.read_decimal(),
280 [start_of_number!(), ..] => self.read_number(),
281 [quote @ (b'\'' | b'"'), ..] => self.read_literal_string(*quote),
282 [b'\\', start_of_identifier!(), ..] => self.read_fully_qualified_identifier(),
283 [start_of_identifier!(), ..] => self.read_identifier(),
284 [b'$', start_of_identifier!(), ..] => {
285 let mut length = 2;
286 while let [part_of_identifier!(), ..] = self.input.peek(length, 1) {
287 length += 1;
288 }
289
290 (TypeTokenKind::Variable, length)
291 }
292 [b':', ..] => (TypeTokenKind::Colon, 1),
293 [b'=', ..] => (TypeTokenKind::Equals, 1),
294 [b'?', ..] => (TypeTokenKind::Question, 1),
295 [b'!', ..] => (TypeTokenKind::Exclamation, 1),
296 [b'&', ..] => (TypeTokenKind::Ampersand, 1),
297 [b'|', ..] => (TypeTokenKind::Pipe, 1),
298 [b'>', ..] => (TypeTokenKind::GreaterThan, 1),
299 [b'<', ..] => (TypeTokenKind::LessThan, 1),
300 [b'(', ..] => (TypeTokenKind::LeftParenthesis, 1),
301 [b')', ..] => (TypeTokenKind::RightParenthesis, 1),
302 [b'[', ..] => (TypeTokenKind::LeftBracket, 1),
303 [b']', ..] => (TypeTokenKind::RightBracket, 1),
304 [b'{', ..] => (TypeTokenKind::LeftBrace, 1),
305 [b'}', ..] => (TypeTokenKind::RightBrace, 1),
306 [b',', ..] => (TypeTokenKind::Comma, 1),
307 [b'+', ..] => (TypeTokenKind::Plus, 1),
308 [b'-', ..] => (TypeTokenKind::Minus, 1),
309 [unknown_byte, ..] => {
310 return Some(Err(SyntaxError::UnrecognizedToken(
311 self.file_id(),
312 *unknown_byte,
313 self.input.current_position(),
314 )));
315 }
316 [] => {
317 unreachable!()
318 }
319 };
320
321 let buffer = self.input.consume(length);
322 let end = self.input.current_position();
323
324 Some(Ok(self.token(kind, buffer, start, end)))
325 }
326
327 fn read_single_line_comment(&self) -> (TypeTokenKind, usize) {
328 let mut length = 2;
329 loop {
330 match self.input.peek(length, 1) {
331 [b'\n', ..] | [] => {
332 break;
333 }
334 [_, ..] => {
335 length += 1;
336 }
337 }
338 }
339
340 (TypeTokenKind::SingleLineComment, length)
341 }
342
343 fn read_decimal(&self) -> (TypeTokenKind, usize) {
344 let mut length = read_digits_of_base(&self.input, 2, 10);
345 if let float_exponent!() = self.input.peek(length, 1) {
346 length += 1;
347 if let number_sign!() = self.input.peek(length, 1) {
348 length += 1;
349 }
350
351 length = read_digits_of_base(&self.input, length, 10);
352 }
353
354 (TypeTokenKind::LiteralFloat, length)
355 }
356
357 fn read_number(&self) -> (TypeTokenKind, usize) {
358 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
359 pub enum NumberKind {
360 Integer,
361 Float,
362 OctalOrFloat,
363 IntegerOrFloat,
364 }
365
366 let mut length = 1;
367
368 let (base, kind): (u8, NumberKind) = match self.input.read(3) {
369 start_of_binary_number!() => {
370 length += 1;
371
372 (2, NumberKind::Integer)
373 }
374 start_of_octal_number!() => {
375 length += 1;
376
377 (8, NumberKind::Integer)
378 }
379 start_of_hexadecimal_number!() => {
380 length += 1;
381
382 (16, NumberKind::Integer)
383 }
384 start_of_octal_or_float_number!() => (10, NumberKind::OctalOrFloat),
385 start_of_float_number!() => (10, NumberKind::Float),
386 _ => (10, NumberKind::IntegerOrFloat),
387 };
388
389 if kind != NumberKind::Float {
390 length = read_digits_of_base(&self.input, length, base);
391
392 if kind == NumberKind::Integer {
393 return (TypeTokenKind::LiteralInteger, length);
394 }
395 }
396
397 let is_float = matches!(self.input.peek(length, 3), float_separator!());
398
399 if !is_float {
400 return (TypeTokenKind::LiteralInteger, length);
401 }
402
403 if let [b'.'] = self.input.peek(length, 1) {
404 length += 1;
405 length = read_digits_of_base(&self.input, length, 10);
406 }
407
408 if let float_exponent!() = self.input.peek(length, 1) {
409 length += 1;
410 if let number_sign!() = self.input.peek(length, 1) {
411 length += 1;
412 }
413
414 length = read_digits_of_base(&self.input, length, 10);
415 }
416
417 (TypeTokenKind::LiteralFloat, length)
418 }
419
420 fn read_literal_string(&self, quote: u8) -> (TypeTokenKind, usize) {
421 let total = self.input.len();
422 let start = self.input.current_offset();
423 let mut length = 1; let mut last_was_backslash = false;
425 let mut partial = false;
426
427 loop {
428 let pos = start + length;
429 if pos >= total {
430 partial = true;
432 break;
433 }
434
435 let byte = self.input.read_at(pos);
436 if matches!(byte, b'\\') {
437 last_was_backslash = !last_was_backslash;
439 length += 1;
440 } else {
441 if byte == "e && !last_was_backslash {
443 length += 1; break;
445 }
446
447 length += 1;
448 last_was_backslash = false;
449 }
450 }
451
452 if partial { (TypeTokenKind::PartialLiteralString, length) } else { (TypeTokenKind::LiteralString, length) }
453 }
454
455 fn read_fully_qualified_identifier(&self) -> (TypeTokenKind, usize) {
456 let mut length = 2;
457 let mut last_was_slash = false;
458 loop {
459 match self.input.peek(length, 1) {
460 [start_of_identifier!(), ..] if last_was_slash => {
461 length += 1;
462 last_was_slash = false;
463 }
464 [part_of_identifier!(), ..] if !last_was_slash => {
465 length += 1;
466 }
467 [b'\\', ..] => {
468 if last_was_slash {
469 length -= 1;
470
471 break;
472 }
473
474 length += 1;
475 last_was_slash = true;
476 }
477 _ => {
478 break;
479 }
480 }
481 }
482
483 (TypeTokenKind::FullyQualifiedIdentifier, length)
484 }
485
486 fn read_identifier(&self) -> (TypeTokenKind, usize) {
487 const KEYWORD_TYPES: [(&[u8], TypeTokenKind); 28] = [
488 (b"list", TypeTokenKind::List),
489 (b"int", TypeTokenKind::Int),
490 (b"integer", TypeTokenKind::Integer),
491 (b"string", TypeTokenKind::String),
492 (b"float", TypeTokenKind::Float),
493 (b"double", TypeTokenKind::Double),
494 (b"real", TypeTokenKind::Real),
495 (b"bool", TypeTokenKind::Bool),
496 (b"boolean", TypeTokenKind::Boolean),
497 (b"false", TypeTokenKind::False),
498 (b"true", TypeTokenKind::True),
499 (b"object", TypeTokenKind::Object),
500 (b"callable", TypeTokenKind::Callable),
501 (b"array", TypeTokenKind::Array),
502 (b"iterable", TypeTokenKind::Iterable),
503 (b"null", TypeTokenKind::Null),
504 (b"mixed", TypeTokenKind::Mixed),
505 (b"resource", TypeTokenKind::Resource),
506 (b"void", TypeTokenKind::Void),
507 (b"scalar", TypeTokenKind::Scalar),
508 (b"numeric", TypeTokenKind::Numeric),
509 (b"never", TypeTokenKind::Never),
510 (b"nothing", TypeTokenKind::Nothing),
511 (b"as", TypeTokenKind::As),
512 (b"is", TypeTokenKind::Is),
513 (b"not", TypeTokenKind::Not),
514 (b"min", TypeTokenKind::Min),
515 (b"max", TypeTokenKind::Max),
516 ];
517
518 let mut length = 1;
519 let mut ended_with_slash = false;
520 loop {
521 match self.input.peek(length, 2) {
522 [part_of_identifier!(), ..] => {
523 length += 1;
524 }
525 [b'\\', start_of_identifier!(), ..] => {
526 ended_with_slash = true;
527 break;
528 }
529 _ => {
530 break;
531 }
532 }
533 }
534
535 if !ended_with_slash {
536 for (value, kind) in KEYWORD_TYPES {
537 let keyword_length = value.len();
538 if keyword_length != length {
539 continue;
540 }
541
542 if self.input.is_at(value, true) {
543 return (kind, keyword_length);
544 }
545 }
546 }
547
548 let mut slashes = 0;
549 let mut last_was_slash = false;
550 loop {
551 match self.input.peek(length, 1) {
552 [start_of_identifier!(), ..] if last_was_slash => {
553 length += 1;
554 last_was_slash = false;
555 }
556 [part_of_identifier!(), ..] if !last_was_slash => {
557 length += 1;
558 }
559 [b'\\', ..] => {
560 if last_was_slash {
561 length -= 1;
562 slashes -= 1;
563 last_was_slash = false;
564
565 break;
566 }
567 length += 1;
568 slashes += 1;
569 last_was_slash = true;
570 }
571 _ => {
572 break;
573 }
574 }
575 }
576
577 if last_was_slash {
578 length -= 1;
579 slashes -= 1;
580 }
581
582 if slashes > 0 { (TypeTokenKind::QualifiedIdentifier, length) } else { (TypeTokenKind::Identifier, length) }
583 }
584
585 #[inline]
586 fn token(&self, kind: TypeTokenKind, value: &'input [u8], from: Position, to: Position) -> TypeToken<'input> {
587 let mut value_chunks = value.utf8_chunks();
588 let value_str = if let Some(chunk) = value_chunks.next() {
589 let valid = chunk.valid();
590
591 debug_assert_eq!(valid.len(), value.len());
592
593 valid
594 } else {
595 ""
596 };
597
598 TypeToken { kind, value: value_str, span: Span::new(self.file_id(), from, to) }
599 }
600}
601
602impl HasFileId for TypeLexer<'_> {
603 fn file_id(&self) -> FileId {
604 self.input.file_id()
605 }
606}