alkale 2.0.0 - Docs.rs

//! Container module for the [`SourceCodeScanner`] type.

use core::cell::UnsafeCell;
use core::str;

use crate::span::Span;

/// u8-indexable array representing how long a utf-8 character
/// is, in bytes, based on the first byte.
///
/// `0` indicates this is an invalid start character.
const CHAR_LENGTHS: [u8; 256] = [
    // Second nibble
    // 1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 0 First nibble
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 1
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 2
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 3
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 4
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 5
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 6
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 7
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 8
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 9
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // A
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // B
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // C
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // D
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // E
    4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, // F
];

/// Shift the value 6 bits to the left and insert the lower
/// 6 bits of the byte into the now-empty space.
#[inline(always)]
const fn append_continuation_byte(base: u32, byte: u8) -> u32 {
    (base << 6) | (byte & 0b00111111) as u32
}

/// An iterator-like interface into source code. This type is not thread-safe.
///
/// This type is the backbone of all Alkale parsers, serving as the main way
/// source code is read and converted into [`Token`][crate::token::Token]s
/// and [`Notification`][crate::notification::Notification]s.
///
/// A scanner can be created simply with the [`new`][Self::new] method.
///
/// # Core Methods
/// These are the main methods used to interface with a scanner. More low-level
/// and specific versions of these methods exist, however. Such methods will be documented in
/// each core method's "See Also" section.
///
/// - [`next`][Self::next]: Consume the next character and return it.
/// - [`skip`][Self::skip]: Skip the next character.
/// - [`peek`][Self::peek]: Read the next character without consuming it.
/// - [`has_next`][Self::has_next]: Return whether more characters are left to consume.
/// - [`span`][Self::char_index]: Returns the [`Span`] the next character in the scanner.
///
/// # Additional Methods
/// If the `common` feature is not disabled, Alkale features many additional methods on
/// this type. They use the following naming conventions.
///
/// ## `try_`
/// Methods with a `try_` prefix will return an [`Option`]. If [`None`] is returned,
/// then these methods will be no-ops and consume 0 characters.
///
/// ## `consume_`
/// Methods with a `consume_` prefix will, as their name suggests, consume a section of
/// source code and return a slice to it.
///
/// ## `skip_`
/// Methods with a `skip_` prefix will continuously consume characters until EOF
/// or until a certain character type is found. They are always no-ops if already at
/// EOF or if the next character already is of the expected type.
///
/// ## `capture_`
/// Methods with a `capture_` prefix take a predicate and execute it, returning some
/// kind of metadata that would be repetitive or slow to collect manually.
///
/// ## `parse_`
/// Methods with a `parse_` prefix act similarly to `consume_` methods, but their
/// data gets processed in some way beforehand.
///
#[derive(Debug)]
pub struct SourceCodeScanner<'src> {
    /// The source byte array, this is always valid UTF-8.
    source_bytes: &'src [u8],
    /// Static pointer to the byte after the final byte in the source.
    final_ptr: *const u8,
    /// Pointer to the first byte of the next character in this stream.
    cur_char_ptr: UnsafeCell<*const u8>,
    /// Pointer to the first byte of the character *after* the next
    /// character. If `next_char_ptr == cur_char_ptr`, we must be at the end of the
    /// stream.
    next_char_ptr: UnsafeCell<*const u8>,
    /// The character pointed to by `cur_char_ptr`. If `cur_char_ptr` is equal
    /// to `next_char_ptr`, this is garbage data.
    cached_char: UnsafeCell<char>,
    /// The current char index in the source code— if 5 chars have been consumed
    /// so far, then this should be 5.
    char_index: UnsafeCell<usize>,
}

impl<'src> SourceCodeScanner<'src> {
    /// Create a new [`SourceCodeScanner`] for the given
    /// source code.
    #[must_use]
    pub fn new(source: &'src str) -> Self {
        // SAFETY: Safe because it remains in-bounds of the
        // source array. See ::add documentation.
        let final_ptr = unsafe { source.as_ptr().add(source.len()) };

        let out = Self {
            source_bytes: source.as_bytes(),
            final_ptr,
            next_char_ptr: UnsafeCell::new(source.as_ptr()),
            cur_char_ptr: UnsafeCell::new(source.as_ptr()),
            cached_char: UnsafeCell::new(' '), // Doesn't matter what char
            char_index: UnsafeCell::new(0),
        };

        out.refresh_cache();

        out
    }

    /// Align `next_char_ptr` to its proper position and refresh the char cache.
    ///
    /// This method is most useful after calls to
    /// [`skip_without_cache`][Self::skip_without_cache] to re-synchronize the
    /// scanner's character cache to resume normal usage.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// // Progress the scanner to 'B' but don't notify
    /// // it of the change.
    /// unsafe { scanner.skip_without_cache(); }
    ///
    /// // Outdated value returned.
    /// assert_eq!(scanner.peek(), Some('A'));
    ///
    /// // Tell the cache to refresh its state.
    /// scanner.refresh_cache();
    ///
    /// // Proper value returned now.
    /// assert_eq!(scanner.peek(), Some('B'));
    /// ```
    #[inline]
    pub fn refresh_cache(&self) {
        // SAFETY: No pointers are leaked outside of this method.
        unsafe {
            let cur_char_ptr = *self.cur_char_ptr.get();
            let next_char_ptr = self.next_char_ptr.get();

            // If no chars remain, next_char = cur_char.
            if cur_char_ptr >= self.final_ptr {
                *next_char_ptr = cur_char_ptr;
                return;
            }

            let (char, ptr) = self.read_next_char();
            *next_char_ptr = ptr;
            *self.cached_char.get() = char;
        }
    }

    /// Shift the given pointer forwards 1 byte, returning
    /// the byte that it previously pointed to.
    ///
    /// # Safety
    /// This method assumes that `ptr + 1` will not
    /// overflow [`isize::MAX`].
    #[inline(always)]
    unsafe fn take_byte(ptr: *mut *const u8) -> u8 {
        let result = **ptr;
        *ptr = (*ptr).byte_add(1);
        result
    }

    /// Read the next character in the source code.
    /// This will return the parsed character as well as a pointer to the byte
    /// after it. No state is modified.
    ///
    /// # Safety
    /// There must be at least 1 character left to consume.
    unsafe fn read_next_char(&self) -> (char, *const u8) {
        let mut ptr = *self.cur_char_ptr.get();

        let initial_byte = Self::take_byte(&mut ptr);

        // Mat: [0wwwwwww, ...]
        // Res: 0000000000000000000000000wwwwwww
        // This is an ascii char, cheap return.
        if initial_byte <= 0x7F {
            return (initial_byte as char, ptr);
        }

        // Mat: [1??wwwww, XXxxxxxx, ...]
        // Res: 000000000000000000000wwwwwxxxxxx
        // 2+ byte char
        let mut result = append_continuation_byte(
            (initial_byte & 0b00011111).into(),
            Self::take_byte(&mut ptr),
        );

        if initial_byte >= 0xE0 {
            // Mat: [111wwwww, XXxxxxxx, YYyyyyyy, ...]
            // Res: 000000000000000wwwwwxxxxxxyyyyyy
            // 3+ byte char
            result = append_continuation_byte(result, Self::take_byte(&mut ptr));

            if initial_byte >= 0xF0 {
                // Mat: [11110www, XXxxxxxx, YYyyyyyy, ZZzzzzzz, ...]
                // Res: 00000000010wwwxxxxxxyyyyyyzzzzzz
                // 4 byte char; The '1' bit above is now erroneous
                result = append_continuation_byte(result, Self::take_byte(&mut ptr));
                result &= 0b11111111_10111111_11111111_11111111;
            }
        }

        (char::from_u32_unchecked(result), ptr)
    }

    /// Same as [`read_next_char`][Self::read_next_char] but doesn't produce
    /// a [`char`] output, only a pointer. This is faster when usable.
    ///
    /// # Safety
    /// There must be at least 1 character left to consume.
    unsafe fn skip_next_char(&self) -> *const u8 {
        let ptr = *self.cur_char_ptr.get();
        let initial_byte = *ptr;
        let len = CHAR_LENGTHS.get_unchecked(initial_byte as usize);
        ptr.byte_add(*len as usize)
    }

    /// Returns `true` if at least 1 character is left to be
    /// scanned.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// assert_eq!(scanner.has_next(), true);
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.has_next(), true);
    ///
    /// scanner.skip();
    /// scanner.skip();
    /// assert_eq!(scanner.has_next(), false);
    /// ```
    #[inline]
    pub fn has_next(&self) -> bool {
        // SAFETY: Not being cast or exposed publicly.
        unsafe { *self.cur_char_ptr.get() < self.final_ptr }
    }

    /// Consume the next character in the source code and return it. This
    /// will advance the iterator. Returns [None] if no characters remain.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// assert_eq!(scanner.next(), Some('A'));
    /// assert_eq!(scanner.next(), Some('B'));
    /// assert_eq!(scanner.next(), Some('C'));
    /// assert_eq!(scanner.next(), None);
    /// ```
    ///
    /// # See Also
    /// If no return value is needed, [`skip`][Self::skip] is more idiomatic.
    #[inline]
    #[expect(clippy::if_then_some_else_none)]
    pub fn next(&self) -> Option<char> {
        // SAFETY: UnsafeCells are not being cast or exposed publicly.
        // `read_next_char` preconditions are checked.
        unsafe {
            if self.has_next() {
                let cached_char = self.cached_char.get();
                let next_char = self.next_char_ptr.get();

                // Copy the char cache to return
                let result = *cached_char;

                // Move cur_char forward to the next_char.
                *self.cur_char_ptr.get() = *next_char;

                // Calculate next_char_ptr and refresh cache
                if self.has_next() {
                    let (char, ptr) = self.read_next_char();
                    *next_char = ptr;
                    *cached_char = char;
                }

                // Increment char index
                // SAFETY: Will never overflow because this can only increase
                // every time a character is consumes— characters are stored
                // in-memory and thus have an upper bound of usize as well.
                *self.char_index.get() = (*self.char_index.get()).unchecked_add(1);

                Some(result)
            } else {
                None
            }
        }
    }

    /// Skip the next character in this iterator. No-op if no characters remain.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// assert_eq!(scanner.peek(), Some('A'));
    ///
    /// scanner.skip();
    /// scanner.skip();
    /// assert_eq!(scanner.peek(), Some('C'));
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.peek(), None);
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.peek(), None);
    /// ```
    ///
    /// # See Also
    /// [`skip_without_cache`][Self::skip_without_cache] may be more appropriate
    /// in certain situations.
    #[inline]
    pub fn skip(&self) {
        // SAFETY: UnsafeCells are not being cast or exposed publicly.
        // `read_next_char` preconditions are checked.
        unsafe {
            if self.has_next() {
                let next_char = self.next_char_ptr.get();

                // Move cur_char forward to the next_char.
                *self.cur_char_ptr.get() = *next_char;

                // Calculate next_char_ptr and refresh cache
                if self.has_next() {
                    let (char, ptr) = self.read_next_char();
                    *next_char = ptr;
                    *self.cached_char.get() = char;
                }

                // Increment char index
                // SAFETY: Will never overflow because this can only increase
                // every time a character is consumes— characters are stored
                // in-memory and thus have an upper bound of usize as well.
                *self.char_index.get() = (*self.char_index.get()).unchecked_add(1);
            }
        }
    }

    /// Skip the next character in this iterator without updating the char cache.
    ///
    /// This will not update the scanner's internal char cache, thus calling [`next`][Self::next],
    /// [`peek`][Self::peek], or any other char-based methods after this will result the *wrong value*
    /// being returned.
    ///
    /// This method can be far more performant if characters are going to be collected in a
    /// non-char-by-char way, for example, with [`consume_range`][Self::consume_range].
    ///
    /// # Safety
    /// Ensure that no char-collecting methods are called after this.
    ///
    /// # Recovery
    /// After calling this method, the scanner can be considered to be in an *invalid state*.
    /// The [`skip`][Self::skip] and [`refresh_cache`][Self::refresh_cache] methods are the primary
    /// ways of restoring the scanner back to a valid state.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// // Progress the scanner to 'B' but don't notify
    /// // it of the change.
    /// unsafe { scanner.skip_without_cache(); }
    ///
    /// // Outdated value returned.
    /// assert_eq!(scanner.peek(), Some('A'));
    ///
    /// // Tell the cache to refresh its state.
    /// scanner.refresh_cache();
    ///
    /// // Proper value returned now.
    /// assert_eq!(scanner.peek(), Some('B'));
    /// ```
    ///
    /// # See Also:
    /// If the cache needs to be immidiately updated, use [`skip`][Self::skip] instead.
    #[inline]
    pub unsafe fn skip_without_cache(&self) {
        // SAFETY: UnsafeCells are not being cast or exposed publicly.
        // `read_next_char` preconditions are checked.
        unsafe {
            if self.has_next() {
                let next_char = self.next_char_ptr.get();

                // Move cur_char forward to the next_char.
                *self.cur_char_ptr.get() = *next_char;

                // Calculate next_char_ptr and refresh cache
                if self.has_next() {
                    *next_char = self.skip_next_char();
                }

                // Increment char index
                // SAFETY: Will never overflow because this can only increase
                // every time a character is consumes— characters are stored
                // in-memory and thus have an upper bound of usize as well.
                *self.char_index.get() = (*self.char_index.get()).unchecked_add(1);
            }
        }
    }

    /// Peek the next character in the source code without progressing the iterator.
    /// Returns [`None`] if no characters remain.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// assert_eq!(scanner.peek(), Some('A'));
    ///
    /// scanner.skip();
    /// scanner.skip();
    /// assert_eq!(scanner.peek(), Some('C'));
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.peek(), None);
    /// ```
    #[inline]
    pub fn peek(&self) -> Option<char> {
        // SAFETY: Char is returned copied, no ref leaked.
        unsafe { self.has_next().then(|| *self.cached_char.get()) }
    }

    /// Get the current character index of this stream.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// let scanner = SourceCodeScanner::new("Aµc");
    ///
    /// assert_eq!(scanner.char_index(), 0);
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.char_index(), 1);
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.char_index(), 2);
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.char_index(), 3);
    /// ```
    ///
    /// # See Also
    /// Often it is easier to work with [`Span`]s than raw character indices.
    /// Use [`span`][Self::span] to get a [`Span`] over the next character in the
    /// scanner.
    #[inline]
    pub fn char_index(&self) -> usize {
        // SAFETY: Type is returned by-value via Copy.
        unsafe { *self.char_index.get() }
    }

    /// Execute the input predicate, the range of source code that was consumed
    /// is returned as a [`prim@str`] reference.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new("abcdef");
    ///
    /// let capture = scanner.consume_range(|| {
    ///     scanner.skip();
    ///     scanner.skip();
    ///     scanner.skip();
    /// });
    ///
    /// assert_eq!(capture, "abc");
    /// ```
    ///
    /// # See Also
    /// If the `common` feature is not disabled, other `consume_` methods are
    /// generally preferred over this.
    #[inline]
    pub fn consume_range<F: FnOnce()>(&self, predicate: F) -> &'src str {
        // SAFETY: As long as `predicate` only performs valid operations
        // on the underlying bytestream, this is valid. The user will need
        // `unsafe` code to violate this contract regardless.
        unsafe {
            let initial_pointer = *self.cur_char_ptr.get();

            predicate();

            let final_pointer = *self.cur_char_ptr.get();

            let arr_ptr = self.source_bytes.as_ptr();
            let initial_index = initial_pointer.byte_offset_from(arr_ptr) as usize;
            let final_index = final_pointer.byte_offset_from(arr_ptr) as usize;
            str::from_utf8_unchecked(self.source_bytes.get_unchecked(initial_index..final_index))
        }
    }

    /// Get a [`Span`] of the character currently pointed to by this reader. (That is, the
    /// character returned by [`next`][Self::next]) The length of this span is usually `1`.
    ///
    /// If no next character exists, this [`Span`] will be the index *after* the last character, with a
    /// length of `0`.
    ///
    /// This method may be useful when a position in the source code needs to be "saved" for
    /// later reference. For example, at the beginning of a multi-character token like a number.
    /// When using it for this purpose, [`range`][Span::range] or [`up_to`][Span::up_to] may be helpful.
    ///
    /// # Examples
    /// ```rust
    /// # use alkale::SourceCodeScanner;
    /// # use alkale::span::Span;
    /// let scanner = SourceCodeScanner::new("ABC");
    ///
    /// # unsafe {
    /// assert_eq!(scanner.span(), Span::new(0, 1));
    ///
    /// scanner.skip();
    /// scanner.skip();
    /// assert_eq!(scanner.span(), Span::new(2, 1));
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.span(), Span::new(3, 0));
    ///
    /// scanner.skip();
    /// assert_eq!(scanner.span(), Span::new(3, 0));
    /// # }
    /// ```
    #[must_use]
    #[inline]
    pub fn span(&self) -> Span {
        // SAFETY: char_index is always in bounds. Length
        // is set appropriately
        unsafe {
            if self.has_next() {
                Span::new(self.char_index(), 1)
            } else {
                Span::new(self.char_index(), 0)
            }
        }
    }
}