mago_syntax_core/
input.rs

1use memchr::memchr;
2use memchr::memmem::find;
3
4use mago_database::file::File;
5use mago_database::file::FileId;
6use mago_span::Position;
7
8/// A struct representing the input code being lexed.
9///
10/// The `Input` struct provides methods to read, peek, consume, and skip characters
11/// from the bytes input code while keeping track of the current position (line, column, offset).
12#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, PartialOrd, Ord)]
13pub struct Input<'a> {
14    pub(crate) bytes: &'a [u8],
15    pub(crate) length: usize,
16    pub(crate) offset: usize,
17    pub(crate) starting_position: Position,
18}
19
20impl<'a> Input<'a> {
21    /// Creates a new `Input` instance from the given input.
22    ///
23    /// # Arguments
24    ///
25    /// * `file_id` - The unique identifier for the source file this input belongs to.
26    /// * `bytes` - A byte slice representing the input code to be lexed.
27    ///
28    /// # Returns
29    ///
30    /// A new `Input` instance initialized at the beginning of the input.
31    pub fn new(file_id: FileId, bytes: &'a [u8]) -> Self {
32        let length = bytes.len();
33
34        Self { bytes, length, offset: 0, starting_position: Position::start_of(file_id) }
35    }
36
37    /// Creates a new `Input` instance from the contents of a `File`.
38    ///
39    /// # Arguments
40    ///
41    /// * `file` - A reference to the `File` containing the source code.
42    ///
43    /// # Returns
44    ///
45    /// A new `Input` instance initialized with the file's ID and contents.
46    pub fn from_file(file: &'a File) -> Self {
47        Self::new(file.id, file.contents.as_bytes())
48    }
49
50    /// Creates a new `Input` instance representing a byte slice that is
51    /// "anchored" at a specific absolute position within a larger source file.
52    ///
53    /// This is useful when lexing a subset (slice) of a source file, as it allows
54    /// generated tokens to retain accurate absolute positions and spans relative
55    /// to the original file.
56    ///
57    /// The internal cursor (`offset`) starts at 0 relative to the `bytes` slice,
58    /// but the absolute position is calculated relative to the `anchor_position`.
59    ///
60    /// # Arguments
61    ///
62    /// * `bytes` - A byte slice representing the input code subset to be lexed.
63    /// * `anchor_position` - The absolute `Position` in the original source file where
64    ///   the provided `bytes` slice begins.
65    ///
66    /// # Returns
67    ///
68    /// A new `Input` instance ready to lex the `bytes`, maintaining positions
69    /// relative to `anchor_position`.
70    pub fn anchored_at(bytes: &'a [u8], anchor_position: Position) -> Self {
71        let length = bytes.len();
72
73        Self { bytes, length, offset: 0, starting_position: anchor_position }
74    }
75
76    /// Returns the source file identifier of the input code.
77    #[inline]
78    pub const fn file_id(&self) -> FileId {
79        self.starting_position.file_id
80    }
81
82    /// Returns the absolute current `Position` of the lexer within the original source file.
83    ///
84    /// It calculates this by adding the internal offset (progress within the current byte slice)
85    /// to the `starting_position` the `Input` was initialized with.
86    #[inline]
87    pub const fn current_position(&self) -> Position {
88        // Calculate absolute position by adding internal offset to the starting base
89        self.starting_position.forward(self.offset)
90    }
91
92    /// Returns the current internal byte offset relative to the start of the input slice.
93    ///
94    /// This indicates how many bytes have been consumed from the current `bytes` slice.
95    /// To get the absolute position in the original source file, use `current_position()`.
96    #[inline]
97    pub const fn current_offset(&self) -> usize {
98        self.offset
99    }
100
101    /// Returns `true` if the input slice is empty (length is zero).
102    #[inline]
103    pub const fn is_empty(&self) -> bool {
104        self.length == 0
105    }
106
107    /// Returns the total length in bytes of the input slice being processed.
108    #[inline]
109    pub const fn len(&self) -> usize {
110        self.length
111    }
112
113    /// Checks if the current position is at the end of the input.
114    ///
115    /// # Returns
116    ///
117    /// `true` if the current offset is greater than or equal to the input length; `false` otherwise.
118    #[inline]
119    pub const fn has_reached_eof(&self) -> bool {
120        self.offset >= self.length
121    }
122
123    /// Returns a byte slice within a specified absolute range.
124    ///
125    /// The `from` and `to` arguments are absolute byte offsets from the beginning
126    /// of the original source file. The method calculates the correct slice
127    /// relative to the `starting_position` of this `Input`.
128    ///
129    /// This is useful for retrieving the raw text of a `Span` or `Token` whose
130    /// positions are absolute, even when the `Input` only contains a subsection
131    /// of the source file.
132    ///
133    /// The returned slice is defensively clamped to the bounds of the current
134    /// `Input`'s byte slice to prevent panics.
135    ///
136    /// # Arguments
137    ///
138    /// * `from` - The absolute starting byte offset.
139    /// * `to` - The absolute ending byte offset (exclusive).
140    ///
141    /// # Returns
142    ///
143    /// A byte slice `&[u8]` corresponding to the requested range.
144    #[inline]
145    pub fn slice_in_range(&self, from: usize, to: usize) -> &'a [u8] {
146        let base_offset = self.starting_position.offset;
147
148        // Calculate the start and end positions relative to the local `bytes` slice.
149        // `saturating_sub` prevents underflow if `from`/`to` are smaller than `base_offset`.
150        let local_from = from.saturating_sub(base_offset);
151        let local_to = to.saturating_sub(base_offset);
152
153        // Clamp the local indices to the actual length of the `bytes` slice to prevent panics.
154        let start = local_from.min(self.length);
155        let end = local_to.min(self.length);
156
157        // Ensure the start index is not greater than the end index.
158        if start >= end {
159            return &[];
160        }
161
162        // If the start index is beyond the length of the input, return an empty slice.
163        if start >= self.length {
164            return &[];
165        }
166
167        &self.bytes[start..end]
168    }
169
170    /// Advances the current position by one character, updating line and column numbers.
171    ///
172    /// Handles different line endings (`\n`, `\r`, `\r\n`) and updates line and column counters accordingly.
173    ///
174    /// If the end of input is reached, no action is taken.
175    #[inline]
176    pub fn next(&mut self) {
177        if !self.has_reached_eof() {
178            self.offset += 1;
179        }
180    }
181
182    /// Skips the next `count` characters, advancing the position accordingly.
183    ///
184    /// Updates line and column numbers as it advances.
185    ///
186    /// # Arguments
187    ///
188    /// * `count` - The number of characters to skip.
189    #[inline]
190    pub fn skip(&mut self, count: usize) {
191        self.offset = (self.offset + count).min(self.length);
192    }
193
194    /// Consumes the next `count` characters and returns them as a slice.
195    ///
196    /// Advances the position by `count` characters.
197    ///
198    /// # Arguments
199    ///
200    /// * `count` - The number of characters to consume.
201    ///
202    /// # Returns
203    ///
204    /// A byte slice containing the consumed characters.
205    #[inline]
206    pub fn consume(&mut self, count: usize) -> &'a [u8] {
207        let (from, until) = self.calculate_bound(count);
208
209        self.skip(count);
210
211        &self.bytes[from..until]
212    }
213
214    /// Consumes all remaining characters from the current position to the end of input.
215    ///
216    /// Advances the position to EOF.
217    ///
218    /// # Returns
219    ///
220    /// A byte slice containing the remaining characters.
221    #[inline]
222    pub fn consume_remaining(&mut self) -> &'a [u8] {
223        if self.has_reached_eof() {
224            return &[];
225        }
226
227        let from = self.offset;
228        self.offset = self.length;
229
230        &self.bytes[from..]
231    }
232
233    /// Consumes characters until the given byte slice is found.
234    ///
235    /// Advances the position to the start of the search slice if found,
236    /// or to EOF if not found.
237    ///
238    /// # Arguments
239    ///
240    /// * `search` - The byte slice to search for.
241    /// * `ignore_ascii_case` - Whether to ignore ASCII case when comparing characters.
242    ///
243    /// # Returns
244    ///
245    /// A byte slice containing the consumed characters.
246    #[inline]
247    pub fn consume_until(&mut self, search: &[u8], ignore_ascii_case: bool) -> &'a [u8] {
248        let start = self.offset;
249        if !ignore_ascii_case {
250            // For a single-byte search, use memchr.
251            if search.len() == 1 {
252                if let Some(pos) = memchr(search[0], &self.bytes[self.offset..]) {
253                    self.offset += pos;
254                    &self.bytes[start..self.offset]
255                } else {
256                    self.offset = self.length;
257                    &self.bytes[start..self.length]
258                }
259            } else if let Some(pos) = find(&self.bytes[self.offset..], search) {
260                self.offset += pos;
261                &self.bytes[start..self.offset]
262            } else {
263                self.offset = self.length;
264                &self.bytes[start..self.length]
265            }
266        } else {
267            while !self.has_reached_eof() && !self.is_at(search, ignore_ascii_case) {
268                self.offset += 1;
269            }
270
271            &self.bytes[start..self.offset]
272        }
273    }
274
275    #[inline]
276    pub fn consume_through(&mut self, search: u8) -> &'a [u8] {
277        let start = self.offset;
278        if let Some(pos) = memchr::memchr(search, &self.bytes[self.offset..]) {
279            self.offset += pos + 1;
280
281            &self.bytes[start..self.offset]
282        } else {
283            self.offset = self.length;
284
285            &self.bytes[start..self.length]
286        }
287    }
288
289    /// Consumes whitespaces until a non-whitespace character is found.
290    ///
291    /// # Returns
292    ///
293    /// A byte slice containing the consumed whitespaces.
294    #[inline]
295    pub fn consume_whitespaces(&mut self) -> &'a [u8] {
296        let start = self.offset;
297        let bytes = self.bytes;
298        let len = self.length;
299        while self.offset < len && bytes[self.offset].is_ascii_whitespace() {
300            self.offset += 1;
301        }
302
303        &bytes[start..self.offset]
304    }
305
306    /// Reads the next `n` characters without advancing the position.
307    ///
308    /// # Arguments
309    ///
310    /// * `n` - The number of characters to read.
311    ///
312    /// # Returns
313    ///
314    /// A byte slice containing the next `n` characters.
315    #[inline]
316    pub fn read(&self, n: usize) -> &'a [u8] {
317        let (from, until) = self.calculate_bound(n);
318
319        &self.bytes[from..until]
320    }
321
322    /// Reads a single byte at a specific byte offset within the input slice,
323    /// without advancing the internal cursor.
324    ///
325    /// This provides direct, low-level access to the underlying byte data.
326    ///
327    /// # Arguments
328    ///
329    /// * `at` - The zero-based byte offset within the input slice (`self.bytes`)
330    ///   from which to read the byte.
331    ///
332    /// # Returns
333    ///
334    /// A reference to the byte located at the specified offset `at`.
335    ///
336    /// # Panics
337    ///
338    /// This method **panics** if the provided `at` offset is out of bounds
339    /// for the input byte slice (i.e., if `at >= self.bytes.len()`).
340    pub fn read_at(&self, at: usize) -> &'a u8 {
341        &self.bytes[at]
342    }
343
344    /// Checks if the input at the current position matches the given byte slice.
345    ///
346    /// # Arguments
347    ///
348    /// * `search` - The byte slice to compare against the input.
349    /// * `ignore_ascii_case` - Whether to ignore ASCII case when comparing.
350    ///
351    /// # Returns
352    ///
353    /// `true` if the next bytes match `search`; `false` otherwise.
354    #[inline]
355    pub fn is_at(&self, search: &[u8], ignore_ascii_case: bool) -> bool {
356        let (from, until) = self.calculate_bound(search.len());
357        let slice = &self.bytes[from..until];
358
359        if ignore_ascii_case { slice.eq_ignore_ascii_case(search) } else { slice == search }
360    }
361
362    /// Attempts to match the given byte sequence at the current position, ignoring whitespace in the input.
363    ///
364    /// This method tries to match the provided byte slice `search` against the input starting
365    /// from the current position, possibly ignoring ASCII case. Whitespace characters in the input
366    /// are skipped during matching, but their length is included in the returned length.
367    ///
368    /// Importantly, the method **does not include** any trailing whitespace **after** the matched sequence
369    /// in the returned length.
370    ///
371    /// For example, to match the sequence `(string)`, the input could be `(string)`, `( string )`, `(  string )`, etc.,
372    /// and this method would return the total length of the input consumed to match `(string)`,
373    /// including any whitespace within the matched sequence, but **excluding** any whitespace after it.
374    ///
375    /// # Arguments
376    ///
377    /// * `search` - The byte slice to match against the input.
378    /// * `ignore_ascii_case` - If `true`, ASCII case is ignored during comparison.
379    ///
380    /// # Returns
381    ///
382    /// * `Some(length)` - If the input matches `search` (ignoring whitespace within the sequence), returns the total length
383    ///   of the input consumed to match `search`, including any skipped whitespace **within** the matched sequence.
384    /// * `None` - If the input does not match `search`.
385    #[inline]
386    pub const fn match_sequence_ignore_whitespace(&self, search: &[u8], ignore_ascii_case: bool) -> Option<usize> {
387        let mut offset = self.offset;
388        let mut search_offset = 0;
389        let mut length = 0;
390        let bytes = self.bytes;
391        let total = self.length;
392        while search_offset < search.len() {
393            // Skip whitespace in the input.
394            while offset < total && bytes[offset].is_ascii_whitespace() {
395                offset += 1;
396                length += 1;
397            }
398
399            if offset >= total {
400                return None;
401            }
402
403            let input_byte = bytes[offset];
404            let search_byte = search[search_offset];
405            let matched = if ignore_ascii_case {
406                input_byte.eq_ignore_ascii_case(&search_byte)
407            } else {
408                input_byte == search_byte
409            };
410
411            if matched {
412                offset += 1;
413                length += 1;
414                search_offset += 1;
415            } else {
416                return None;
417            }
418        }
419
420        Some(length)
421    }
422
423    /// Peeks ahead `i` characters and reads the next `n` characters without advancing the position.
424    ///
425    /// # Arguments
426    ///
427    /// * `offset` - The number of characters to skip before reading.
428    /// * `n` - The number of characters to read after skipping.
429    ///
430    /// # Returns
431    ///
432    /// A byte slice containing the peeked characters.
433    #[inline]
434    pub fn peek(&self, offset: usize, n: usize) -> &'a [u8] {
435        let from = self.offset + offset;
436        if from >= self.length {
437            return &self.bytes[self.length..self.length];
438        }
439
440        let mut until = from + n;
441        if until >= self.length {
442            until = self.length;
443        }
444
445        &self.bytes[from..until]
446    }
447
448    /// Calculates the bounds for slicing the input safely.
449    ///
450    /// Ensures that slicing does not go beyond the input length.
451    ///
452    /// # Arguments
453    ///
454    /// * `n` - The number of characters to include in the slice.
455    ///
456    /// # Returns
457    ///
458    /// A tuple `(from, until)` representing the start and end indices for slicing.
459    #[inline]
460    const fn calculate_bound(&self, n: usize) -> (usize, usize) {
461        if self.has_reached_eof() {
462            return (self.length, self.length);
463        }
464
465        let mut until = self.offset + n;
466
467        if until >= self.length {
468            until = self.length;
469        }
470
471        (self.offset, until)
472    }
473}
474
#[cfg(test)]
mod tests {
    use mago_span::Position;

    use super::*;

    /// A fresh input starts at offset 0 and exposes the given bytes.
    #[test]
    fn test_new() {
        let bytes = b"Hello, world!";
        let input = Input::new(FileId::zero(), bytes);

        assert_eq!(input.current_position(), Position::new(FileId::zero(), 0));
        assert_eq!(input.length, bytes.len());
        assert_eq!(input.bytes, bytes);
    }

    /// EOF is reported for empty input and after skipping everything.
    #[test]
    fn test_is_eof() {
        let empty = Input::new(FileId::zero(), b"");
        assert!(empty.has_reached_eof());

        let mut input = Input::new(FileId::zero(), b"data");
        assert!(!input.has_reached_eof());

        input.skip(4);
        assert!(input.has_reached_eof());
    }

    /// `next` moves exactly one byte per call; line endings get no special treatment.
    #[test]
    fn test_next() {
        // Bytes: 'a', '\n', 'b', '\r', '\n', 'c', '\r', 'd'.
        let mut input = Input::new(FileId::zero(), b"a\nb\r\nc\rd");

        for expected_offset in 1..=7 {
            input.next();
            assert_eq!(input.current_position(), Position::new(FileId::zero(), expected_offset));
        }
    }

    /// `consume` returns the requested bytes and advances; at EOF it returns nothing.
    #[test]
    fn test_consume() {
        let mut input = Input::new(FileId::zero(), b"abcdef");

        assert_eq!(input.consume(3), b"abc");
        assert_eq!(input.current_position(), Position::new(FileId::zero(), 3));

        assert_eq!(input.consume(3), b"def");
        assert_eq!(input.current_position(), Position::new(FileId::zero(), 6));

        // Nothing is left, so an empty slice comes back.
        assert_eq!(input.consume(1), b"");
        assert!(input.has_reached_eof());
    }

    /// `consume_remaining` returns everything after the cursor and ends at EOF.
    #[test]
    fn test_consume_remaining() {
        let mut input = Input::new(FileId::zero(), b"abcdef");
        input.skip(2);

        assert_eq!(input.consume_remaining(), b"cdef");
        assert!(input.has_reached_eof());
    }

    /// `read` returns bytes without moving the cursor.
    #[test]
    fn test_read() {
        let input = Input::new(FileId::zero(), b"abcdef");

        assert_eq!(input.read(3), b"abc");
        assert_eq!(input.current_position(), Position::new(FileId::zero(), 0));
    }

    /// `is_at` compares case-sensitively when asked to.
    #[test]
    fn test_is_at() {
        let mut input = Input::new(FileId::zero(), b"abcdef");
        assert!(input.is_at(b"abc", false));

        input.skip(2);
        assert!(input.is_at(b"cde", false));
        assert!(!input.is_at(b"xyz", false));
    }

    /// `is_at` can ignore ASCII case.
    #[test]
    fn test_is_at_ignore_ascii_case() {
        let mut input = Input::new(FileId::zero(), b"AbCdEf");
        assert!(input.is_at(b"abc", true));

        input.skip(2);
        assert!(input.is_at(b"cde", true));
        assert!(!input.is_at(b"xyz", true));
    }

    /// `peek` looks ahead without moving the cursor.
    #[test]
    fn test_peek() {
        let input = Input::new(FileId::zero(), b"abcdef");

        assert_eq!(input.peek(2, 3), b"cde");
        assert_eq!(input.current_position(), Position::new(FileId::zero(), 0));
    }

    /// `calculate_bound` clamps the end index to the input length.
    #[test]
    fn test_to_bound() {
        let input = Input::new(FileId::zero(), b"abcdef");

        assert_eq!(input.calculate_bound(3), (0, 3));
        // A request longer than the input is clamped.
        assert_eq!(input.calculate_bound(10), (0, 6));
    }
}
626}