mago_syntax_core/input.rs
1use memchr::memchr;
2use memchr::memmem::find;
3
4use mago_database::file::File;
5use mago_database::file::FileId;
6use mago_span::Position;
7
8/// A struct representing the input code being lexed.
9///
10/// The `Input` struct provides methods to read, peek, consume, and skip characters
11/// from the bytes input code while keeping track of the current position (line, column, offset).
12#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
13pub struct Input<'a> {
14 pub(crate) bytes: &'a [u8],
15 pub(crate) length: usize,
16 pub(crate) offset: usize,
17 pub(crate) starting_position: Position,
18}
19
20impl<'a> Input<'a> {
21 /// Creates a new `Input` instance from the given input.
22 ///
23 /// # Arguments
24 ///
25 /// * `file_id` - The unique identifier for the source file this input belongs to.
26 /// * `bytes` - A byte slice representing the input code to be lexed.
27 ///
28 /// # Returns
29 ///
30 /// A new `Input` instance initialized at the beginning of the input.
31 pub fn new(file_id: FileId, bytes: &'a [u8]) -> Self {
32 let length = bytes.len();
33
34 Self { bytes, length, offset: 0, starting_position: Position::start_of(file_id) }
35 }
36
37 /// Creates a new `Input` instance from the contents of a `File`.
38 ///
39 /// # Arguments
40 ///
41 /// * `file` - A reference to the `File` containing the source code.
42 ///
43 /// # Returns
44 ///
45 /// A new `Input` instance initialized with the file's ID and contents.
46 pub fn from_file(file: &'a File) -> Self {
47 Self::new(file.id, file.contents.as_bytes())
48 }
49
50 /// Creates a new `Input` instance representing a byte slice that is
51 /// "anchored" at a specific absolute position within a larger source file.
52 ///
53 /// This is useful when lexing a subset (slice) of a source file, as it allows
54 /// generated tokens to retain accurate absolute positions and spans relative
55 /// to the original file.
56 ///
57 /// The internal cursor (`offset`) starts at 0 relative to the `bytes` slice,
58 /// but the absolute position is calculated relative to the `anchor_position`.
59 ///
60 /// # Arguments
61 ///
62 /// * `bytes` - A byte slice representing the input code subset to be lexed.
63 /// * `anchor_position` - The absolute `Position` in the original source file where
64 /// the provided `bytes` slice begins.
65 ///
66 /// # Returns
67 ///
68 /// A new `Input` instance ready to lex the `bytes`, maintaining positions
69 /// relative to `anchor_position`.
70 pub fn anchored_at(bytes: &'a [u8], anchor_position: Position) -> Self {
71 let length = bytes.len();
72
73 Self { bytes, length, offset: 0, starting_position: anchor_position }
74 }
75
76 /// Returns the source file identifier of the input code.
77 #[inline]
78 pub const fn file_id(&self) -> FileId {
79 self.starting_position.file_id
80 }
81
82 /// Returns the absolute current `Position` of the lexer within the original source file.
83 ///
84 /// It calculates this by adding the internal offset (progress within the current byte slice)
85 /// to the `starting_position` the `Input` was initialized with.
86 #[inline]
87 pub const fn current_position(&self) -> Position {
88 // Calculate absolute position by adding internal offset to the starting base
89 self.starting_position.forward(self.offset)
90 }
91
92 /// Returns the current internal byte offset relative to the start of the input slice.
93 ///
94 /// This indicates how many bytes have been consumed from the current `bytes` slice.
95 /// To get the absolute position in the original source file, use `current_position()`.
96 #[inline]
97 pub const fn current_offset(&self) -> usize {
98 self.offset
99 }
100
101 /// Returns `true` if the input slice is empty (length is zero).
102 #[inline]
103 pub const fn is_empty(&self) -> bool {
104 self.length == 0
105 }
106
107 /// Returns the total length in bytes of the input slice being processed.
108 #[inline]
109 pub const fn len(&self) -> usize {
110 self.length
111 }
112
113 /// Checks if the current position is at the end of the input.
114 ///
115 /// # Returns
116 ///
117 /// `true` if the current offset is greater than or equal to the input length; `false` otherwise.
118 #[inline]
119 pub const fn has_reached_eof(&self) -> bool {
120 self.offset >= self.length
121 }
122
123 /// Returns a byte slice within a specified absolute range.
124 ///
125 /// The `from` and `to` arguments are absolute byte offsets from the beginning
126 /// of the original source file. The method calculates the correct slice
127 /// relative to the `starting_position` of this `Input`.
128 ///
129 /// This is useful for retrieving the raw text of a `Span` or `Token` whose
130 /// positions are absolute, even when the `Input` only contains a subsection
131 /// of the source file.
132 ///
133 /// The returned slice is defensively clamped to the bounds of the current
134 /// `Input`'s byte slice to prevent panics.
135 ///
136 /// # Arguments
137 ///
138 /// * `from` - The absolute starting byte offset.
139 /// * `to` - The absolute ending byte offset (exclusive).
140 ///
141 /// # Returns
142 ///
143 /// A byte slice `&[u8]` corresponding to the requested range.
144 #[inline]
145 pub fn slice_in_range(&self, from: usize, to: usize) -> &'a [u8] {
146 let base_offset = self.starting_position.offset;
147
148 // Calculate the start and end positions relative to the local `bytes` slice.
149 // `saturating_sub` prevents underflow if `from`/`to` are smaller than `base_offset`.
150 let local_from = from.saturating_sub(base_offset);
151 let local_to = to.saturating_sub(base_offset);
152
153 // Clamp the local indices to the actual length of the `bytes` slice to prevent panics.
154 let start = local_from.min(self.length);
155 let end = local_to.min(self.length);
156
157 // Ensure the start index is not greater than the end index.
158 if start >= end {
159 return &[];
160 }
161
162 // If the start index is beyond the length of the input, return an empty slice.
163 if start >= self.length {
164 return &[];
165 }
166
167 &self.bytes[start..end]
168 }
169
170 /// Advances the current position by one character, updating line and column numbers.
171 ///
172 /// Handles different line endings (`\n`, `\r`, `\r\n`) and updates line and column counters accordingly.
173 ///
174 /// If the end of input is reached, no action is taken.
175 #[inline]
176 pub fn next(&mut self) {
177 if !self.has_reached_eof() {
178 self.offset += 1;
179 }
180 }
181
182 /// Skips the next `count` characters, advancing the position accordingly.
183 ///
184 /// Updates line and column numbers as it advances.
185 ///
186 /// # Arguments
187 ///
188 /// * `count` - The number of characters to skip.
189 #[inline]
190 pub fn skip(&mut self, count: usize) {
191 self.offset = (self.offset + count).min(self.length);
192 }
193
194 /// Consumes the next `count` characters and returns them as a slice.
195 ///
196 /// Advances the position by `count` characters.
197 ///
198 /// # Arguments
199 ///
200 /// * `count` - The number of characters to consume.
201 ///
202 /// # Returns
203 ///
204 /// A byte slice containing the consumed characters.
205 #[inline]
206 pub fn consume(&mut self, count: usize) -> &'a [u8] {
207 let (from, until) = self.calculate_bound(count);
208
209 self.skip(count);
210
211 &self.bytes[from..until]
212 }
213
214 /// Consumes all remaining characters from the current position to the end of input.
215 ///
216 /// Advances the position to EOF.
217 ///
218 /// # Returns
219 ///
220 /// A byte slice containing the remaining characters.
221 #[inline]
222 pub fn consume_remaining(&mut self) -> &'a [u8] {
223 if self.has_reached_eof() {
224 return &[];
225 }
226
227 let from = self.offset;
228 self.offset = self.length;
229
230 &self.bytes[from..]
231 }
232
233 /// Consumes characters until the given byte slice is found.
234 ///
235 /// Advances the position to the start of the search slice if found,
236 /// or to EOF if not found.
237 ///
238 /// # Arguments
239 ///
240 /// * `search` - The byte slice to search for.
241 /// * `ignore_ascii_case` - Whether to ignore ASCII case when comparing characters.
242 ///
243 /// # Returns
244 ///
245 /// A byte slice containing the consumed characters.
246 #[inline]
247 pub fn consume_until(&mut self, search: &[u8], ignore_ascii_case: bool) -> &'a [u8] {
248 let start = self.offset;
249 if !ignore_ascii_case {
250 // For a single-byte search, use memchr.
251 if search.len() == 1 {
252 if let Some(pos) = memchr(search[0], &self.bytes[self.offset..]) {
253 self.offset += pos;
254 &self.bytes[start..self.offset]
255 } else {
256 self.offset = self.length;
257 &self.bytes[start..self.length]
258 }
259 } else if let Some(pos) = find(&self.bytes[self.offset..], search) {
260 self.offset += pos;
261 &self.bytes[start..self.offset]
262 } else {
263 self.offset = self.length;
264 &self.bytes[start..self.length]
265 }
266 } else {
267 while !self.has_reached_eof() && !self.is_at(search, ignore_ascii_case) {
268 self.offset += 1;
269 }
270
271 &self.bytes[start..self.offset]
272 }
273 }
274
275 #[inline]
276 pub fn consume_through(&mut self, search: u8) -> &'a [u8] {
277 let start = self.offset;
278 if let Some(pos) = memchr::memchr(search, &self.bytes[self.offset..]) {
279 self.offset += pos + 1;
280
281 &self.bytes[start..self.offset]
282 } else {
283 self.offset = self.length;
284
285 &self.bytes[start..self.length]
286 }
287 }
288
289 /// Consumes whitespaces until a non-whitespace character is found.
290 ///
291 /// # Returns
292 ///
293 /// A byte slice containing the consumed whitespaces.
294 #[inline]
295 pub fn consume_whitespaces(&mut self) -> &'a [u8] {
296 let start = self.offset;
297 let bytes = self.bytes;
298 let len = self.length;
299 while self.offset < len && bytes[self.offset].is_ascii_whitespace() {
300 self.offset += 1;
301 }
302
303 &bytes[start..self.offset]
304 }
305
306 /// Reads the next `n` characters without advancing the position.
307 ///
308 /// # Arguments
309 ///
310 /// * `n` - The number of characters to read.
311 ///
312 /// # Returns
313 ///
314 /// A byte slice containing the next `n` characters.
315 #[inline]
316 pub fn read(&self, n: usize) -> &'a [u8] {
317 let (from, until) = self.calculate_bound(n);
318
319 &self.bytes[from..until]
320 }
321
322 /// Reads a single byte at a specific byte offset within the input slice,
323 /// without advancing the internal cursor.
324 ///
325 /// This provides direct, low-level access to the underlying byte data.
326 ///
327 /// # Arguments
328 ///
329 /// * `at` - The zero-based byte offset within the input slice (`self.bytes`)
330 /// from which to read the byte.
331 ///
332 /// # Returns
333 ///
334 /// A reference to the byte located at the specified offset `at`.
335 ///
336 /// # Panics
337 ///
338 /// This method **panics** if the provided `at` offset is out of bounds
339 /// for the input byte slice (i.e., if `at >= self.bytes.len()`).
340 pub fn read_at(&self, at: usize) -> &'a u8 {
341 &self.bytes[at]
342 }
343
344 /// Checks if the input at the current position matches the given byte slice.
345 ///
346 /// # Arguments
347 ///
348 /// * `search` - The byte slice to compare against the input.
349 /// * `ignore_ascii_case` - Whether to ignore ASCII case when comparing.
350 ///
351 /// # Returns
352 ///
353 /// `true` if the next bytes match `search`; `false` otherwise.
354 #[inline]
355 pub fn is_at(&self, search: &[u8], ignore_ascii_case: bool) -> bool {
356 let (from, until) = self.calculate_bound(search.len());
357 let slice = &self.bytes[from..until];
358
359 if ignore_ascii_case { slice.eq_ignore_ascii_case(search) } else { slice == search }
360 }
361
362 /// Attempts to match the given byte sequence at the current position, ignoring whitespace in the input.
363 ///
364 /// This method tries to match the provided byte slice `search` against the input starting
365 /// from the current position, possibly ignoring ASCII case. Whitespace characters in the input
366 /// are skipped during matching, but their length is included in the returned length.
367 ///
368 /// Importantly, the method **does not include** any trailing whitespace **after** the matched sequence
369 /// in the returned length.
370 ///
371 /// For example, to match the sequence `(string)`, the input could be `(string)`, `( string )`, `( string )`, etc.,
372 /// and this method would return the total length of the input consumed to match `(string)`,
373 /// including any whitespace within the matched sequence, but **excluding** any whitespace after it.
374 ///
375 /// # Arguments
376 ///
377 /// * `search` - The byte slice to match against the input.
378 /// * `ignore_ascii_case` - If `true`, ASCII case is ignored during comparison.
379 ///
380 /// # Returns
381 ///
382 /// * `Some(length)` - If the input matches `search` (ignoring whitespace within the sequence), returns the total length
383 /// of the input consumed to match `search`, including any skipped whitespace **within** the matched sequence.
384 /// * `None` - If the input does not match `search`.
385 #[inline]
386 pub const fn match_sequence_ignore_whitespace(&self, search: &[u8], ignore_ascii_case: bool) -> Option<usize> {
387 let mut offset = self.offset;
388 let mut search_offset = 0;
389 let mut length = 0;
390 let bytes = self.bytes;
391 let total = self.length;
392 while search_offset < search.len() {
393 // Skip whitespace in the input.
394 while offset < total && bytes[offset].is_ascii_whitespace() {
395 offset += 1;
396 length += 1;
397 }
398
399 if offset >= total {
400 return None;
401 }
402
403 let input_byte = bytes[offset];
404 let search_byte = search[search_offset];
405 let matched = if ignore_ascii_case {
406 input_byte.eq_ignore_ascii_case(&search_byte)
407 } else {
408 input_byte == search_byte
409 };
410
411 if matched {
412 offset += 1;
413 length += 1;
414 search_offset += 1;
415 } else {
416 return None;
417 }
418 }
419
420 Some(length)
421 }
422
423 /// Peeks ahead `i` characters and reads the next `n` characters without advancing the position.
424 ///
425 /// # Arguments
426 ///
427 /// * `offset` - The number of characters to skip before reading.
428 /// * `n` - The number of characters to read after skipping.
429 ///
430 /// # Returns
431 ///
432 /// A byte slice containing the peeked characters.
433 #[inline]
434 pub fn peek(&self, offset: usize, n: usize) -> &'a [u8] {
435 let from = self.offset + offset;
436 if from >= self.length {
437 return &self.bytes[self.length..self.length];
438 }
439
440 let mut until = from + n;
441 if until >= self.length {
442 until = self.length;
443 }
444
445 &self.bytes[from..until]
446 }
447
448 /// Calculates the bounds for slicing the input safely.
449 ///
450 /// Ensures that slicing does not go beyond the input length.
451 ///
452 /// # Arguments
453 ///
454 /// * `n` - The number of characters to include in the slice.
455 ///
456 /// # Returns
457 ///
458 /// A tuple `(from, until)` representing the start and end indices for slicing.
459 #[inline]
460 const fn calculate_bound(&self, n: usize) -> (usize, usize) {
461 if self.has_reached_eof() {
462 return (self.length, self.length);
463 }
464
465 let mut until = self.offset + n;
466
467 if until >= self.length {
468 until = self.length;
469 }
470
471 (self.offset, until)
472 }
473}
474
#[cfg(test)]
mod tests {
    use mago_span::Position;

    use super::*;

    /// A fresh input starts at offset 0 and exposes the bytes it was given.
    #[test]
    fn test_new() {
        let bytes = b"Hello, world!";
        let input = Input::new(FileId::zero(), bytes);

        assert_eq!(input.current_position(), Position::new(FileId::zero(), 0));
        assert_eq!(input.length, bytes.len());
        assert_eq!(input.bytes, bytes);
    }

    /// EOF is reported for empty input and once every byte has been skipped.
    #[test]
    fn test_is_eof() {
        let empty = Input::new(FileId::zero(), b"");
        assert!(empty.has_reached_eof());

        let mut input = Input::new(FileId::zero(), b"data");
        assert!(!input.has_reached_eof());

        input.skip(4);
        assert!(input.has_reached_eof());
    }

    /// `next` advances one byte per call; line endings get no special treatment,
    /// so `\r\n` takes two calls. At EOF the call is a no-op.
    #[test]
    fn test_next() {
        // Bytes: 'a', '\n', 'b', '\r', '\n', 'c', '\r', 'd' (8 in total).
        let bytes = b"a\nb\r\nc\rd";
        let mut input = Input::new(FileId::zero(), bytes);

        for expected in 1..=8 {
            input.next();
            assert_eq!(input.current_position(), Position::new(FileId::zero(), expected));
        }

        assert!(input.has_reached_eof());

        // Already at EOF: the cursor must stay where it is.
        input.next();
        assert_eq!(input.current_offset(), bytes.len());
    }

    /// `consume` returns the requested bytes, advances the cursor, and yields an
    /// empty slice at EOF.
    #[test]
    fn test_consume() {
        let mut input = Input::new(FileId::zero(), b"abcdef");

        assert_eq!(input.consume(3), b"abc");
        assert_eq!(input.current_position(), Position::new(FileId::zero(), 3));

        assert_eq!(input.consume(3), b"def");
        assert_eq!(input.current_position(), Position::new(FileId::zero(), 6));

        // Nothing is left, so the result is empty and the cursor stays at EOF.
        assert_eq!(input.consume(1), b"");
        assert!(input.has_reached_eof());
    }

    /// `consume_remaining` returns everything after the cursor and leaves it at EOF.
    #[test]
    fn test_consume_remaining() {
        let mut input = Input::new(FileId::zero(), b"abcdef");

        input.skip(2);

        assert_eq!(input.consume_remaining(), b"cdef");
        assert!(input.has_reached_eof());
    }

    /// `read` returns bytes without moving the cursor.
    #[test]
    fn test_read() {
        let input = Input::new(FileId::zero(), b"abcdef");

        assert_eq!(input.read(3), b"abc");
        assert_eq!(input.current_position(), Position::new(FileId::zero(), 0));
    }

    /// `is_at` compares case-sensitively against the bytes at the cursor.
    #[test]
    fn test_is_at() {
        let mut input = Input::new(FileId::zero(), b"abcdef");

        assert!(input.is_at(b"abc", false));

        input.skip(2);

        assert!(input.is_at(b"cde", false));
        assert!(!input.is_at(b"xyz", false));
    }

    /// `is_at` with `ignore_ascii_case` matches regardless of ASCII letter case.
    #[test]
    fn test_is_at_ignore_ascii_case() {
        let mut input = Input::new(FileId::zero(), b"AbCdEf");

        assert!(input.is_at(b"abc", true));

        input.skip(2);

        assert!(input.is_at(b"cde", true));
        assert!(!input.is_at(b"xyz", true));
    }

    /// `peek` reads ahead of the cursor without moving it.
    #[test]
    fn test_peek() {
        let input = Input::new(FileId::zero(), b"abcdef");

        assert_eq!(input.peek(2, 3), b"cde");
        assert_eq!(input.current_position(), Position::new(FileId::zero(), 0));
    }

    /// `calculate_bound` clamps the upper bound to the input length.
    #[test]
    fn test_to_bound() {
        let input = Input::new(FileId::zero(), b"abcdef");

        assert_eq!(input.calculate_bound(3), (0, 3));

        // Asking for more than is available stops at the input length.
        assert_eq!(input.calculate_bound(10), (0, 6));
    }
}
626}