logos 0.11.0-rc1

Create ridiculously fast Lexers
Documentation
//! This module contains a bunch of traits necessary for processing byte strings.
//!
//! Most notable are:
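//! * `Source` - implemented for `str` and `[u8]`, used by the `Lexer`.
//! * `Source::Slice` - the associated type a `Source` is sliced into, returned by `Lexer::slice`.
//! * `Chunk` - fixed-size pieces (`u8`, `&[u8; N]`) that `Source::read` can produce.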

use std::fmt::Debug;
use std::ops::Range;

/// Trait for types the `Lexer` can read from.
///
/// Most notably this is implemented for `&str`. It is unlikely you will
/// ever want to use this Trait yourself, unless implementing a new `Source`
/// the `Lexer` can use.
pub trait Source {
    /// A type this `Source` can be sliced into.
    type Slice: ?Sized + PartialEq + Eq + Debug;

    /// Length of the source
    fn len(&self) -> usize;

    /// Read a chunk of bytes into an array. Returns `None` when reading
    /// out of bounds would occur.
    ///
    /// This is very useful for matching fixed-size byte arrays, and tends
    /// to be very fast at it too, since the compiler knows the byte lengths.
    ///
    /// ```rust
    /// use logos::Source;
    ///
    /// fn main() {
    ///     let foo = "foo";
    ///
    ///     assert_eq!(foo.read(0), Some(b"foo"));     // Option<&[u8; 3]>
    ///     assert_eq!(foo.read(0), Some(b"fo"));      // Option<&[u8; 2]>
    ///     assert_eq!(foo.read(2), Some(b'o'));       // Option<u8>
    ///     assert_eq!(foo.read::<&[u8; 4]>(0), None); // Out of bounds
    ///     assert_eq!(foo.read::<&[u8; 2]>(2), None); // Out of bounds
    /// }
    /// ```
    fn read<'a, Chunk>(&'a self, offset: usize) -> Option<Chunk>
    where
        Chunk: self::Chunk<'a>;

    /// Get a slice of the source at given range. This is analogous to
    /// `slice::get(range)`.
    ///
    /// ```rust
    /// use logos::Source;
    ///
    /// fn main() {
    ///     let foo = "It was the year when they finally immanentized the Eschaton.";
    ///
    ///     assert_eq!(<str as Source>::slice(&foo, 51..59), Some("Eschaton"));
    /// }
    /// ```
    fn slice(&self, range: Range<usize>) -> Option<&Self::Slice>;

    /// Get a slice of the source at given range. This is analogous to
    /// `slice::get_unchecked(range)`.
    ///
    /// **Using this method with range out of bounds is undefined behavior!**
    ///
    /// ```rust
    /// use logos::Source;
    ///
    /// fn main() {
    ///     let foo = "It was the year when they finally immanentized the Eschaton.";
    ///
    ///     unsafe {
    ///         assert_eq!(<str as Source>::slice_unchecked(&foo, 51..59), "Eschaton");
    ///     }
    /// }
    /// ```
    unsafe fn slice_unchecked(&self, range: Range<usize>) -> &Self::Slice;

    /// For `&str` sources attempts to find the closest `char` boundary at which source
    /// can be sliced, starting from `index`.
    ///
    /// For binary sources (`&[u8]`) this should just return `index` back.
    #[inline]
    fn find_boundary(&self, index: usize) -> usize {
        index
    }

    /// Check if `index` is valid for this `Source`, that is:
    ///
    /// + It's not larger than the byte length of the `Source`.
    /// + (`str` only) It doesn't land in the middle of a UTF-8 code point.
    fn is_boundary(&self, index: usize) -> bool;
}

/// `Source` implementation for UTF-8 text; slicing it yields `str`.
impl Source for str {
    type Slice = str;

    /// Byte length of the string (delegates to the inherent `str::len`).
    #[inline]
    fn len(&self) -> usize {
        (*self).len()
    }

    /// Reads a fixed-size `Chunk` starting at byte `offset`.
    ///
    /// Returns `None` unless all `Chunk::SIZE` bytes lie inside the string.
    /// That includes offsets so large that `offset + Chunk::SIZE` would
    /// overflow `usize`.
    #[inline]
    fn read<'a, Chunk>(&'a self, offset: usize) -> Option<Chunk>
    where
        Chunk: self::Chunk<'a>,
    {
        // `checked_add` keeps a huge `offset` from wrapping around and slipping
        // past the bounds check, which would become an out-of-bounds read.
        match offset.checked_add(Chunk::SIZE) {
            Some(end) if end <= (*self).len() => {
                // SAFETY: `offset..end` was just verified to lie within `self`,
                // so the pointer is valid for reading `Chunk::SIZE` bytes.
                Some(unsafe { Chunk::from_ptr((*self).as_ptr().add(offset)) })
            }
            _ => None,
        }
    }

    /// Checked slicing; `None` if `range` is out of bounds or does not fall
    /// on `char` boundaries (see `str::get`).
    #[inline]
    fn slice(&self, range: Range<usize>) -> Option<&str> {
        self.get(range)
    }

    /// Unchecked slicing.
    ///
    /// # Safety
    ///
    /// `range` must satisfy the requirements of `str::get_unchecked`: it must
    /// be in bounds and both ends must lie on `char` boundaries. Debug builds
    /// additionally assert that both ends are within the string's length.
    #[inline]
    unsafe fn slice_unchecked(&self, range: Range<usize>) -> &str {
        debug_assert!(
            range.start <= self.len() && range.end <= self.len(),
            "Reading out of bounds {:?} for {}!",
            range,
            self.len()
        );

        // SAFETY: upheld by the caller, see `# Safety` above.
        self.get_unchecked(range)
    }

    /// Returns the first `char` boundary at or after `index`.
    ///
    /// An `index` past the end of the string has no boundary after it and is
    /// returned unchanged.
    #[inline]
    fn find_boundary(&self, mut index: usize) -> usize {
        // `is_char_boundary` is `false` for every index beyond the length, so
        // without the length bound an out-of-range `index` would keep
        // incrementing until it overflowed.
        while index < (*self).len() && !self.is_char_boundary(index) {
            index += 1;
        }

        index
    }

    /// `true` when `index` is within the string and on a `char` boundary.
    #[inline]
    fn is_boundary(&self, index: usize) -> bool {
        self.is_char_boundary(index)
    }
}

/// `Source` implementation for raw bytes; slicing it yields `[u8]`.
///
/// `find_boundary` is not overridden here, so the trait's default is used.
impl Source for [u8] {
    type Slice = [u8];

    /// Number of bytes in the slice (delegates to the inherent `len`).
    #[inline]
    fn len(&self) -> usize {
        (*self).len()
    }

    /// Reads a fixed-size `Chunk` starting at byte `offset`.
    ///
    /// Returns `None` unless all `Chunk::SIZE` bytes lie inside the slice.
    /// That includes offsets so large that `offset + Chunk::SIZE` would
    /// overflow `usize`.
    #[inline]
    fn read<'a, Chunk>(&'a self, offset: usize) -> Option<Chunk>
    where
        Chunk: self::Chunk<'a>,
    {
        // `checked_add` keeps a huge `offset` from wrapping around and slipping
        // past the bounds check, which would become an out-of-bounds read.
        match offset.checked_add(Chunk::SIZE) {
            Some(end) if end <= (*self).len() => {
                // SAFETY: `offset..end` was just verified to lie within `self`,
                // so the pointer is valid for reading `Chunk::SIZE` bytes.
                Some(unsafe { Chunk::from_ptr((*self).as_ptr().add(offset)) })
            }
            _ => None,
        }
    }

    /// Checked slicing; `None` if `range` is out of bounds.
    #[inline]
    fn slice(&self, range: Range<usize>) -> Option<&[u8]> {
        self.get(range)
    }

    /// Unchecked slicing.
    ///
    /// # Safety
    ///
    /// `range` must be in bounds for this slice, as required by
    /// `slice::get_unchecked`. Debug builds additionally assert that both
    /// ends are within the slice's length.
    #[inline]
    unsafe fn slice_unchecked(&self, range: Range<usize>) -> &[u8] {
        debug_assert!(
            range.start <= self.len() && range.end <= self.len(),
            "Reading out of bounds {:?} for {}!",
            range,
            self.len()
        );

        // SAFETY: upheld by the caller, see `# Safety` above.
        self.get_unchecked(range)
    }

    /// Every index up to and including the length is a valid position.
    #[inline]
    fn is_boundary(&self, index: usize) -> bool {
        index <= self.len()
    }
}

/// A fixed, statically sized chunk of data that can be read from the `Source`.
///
/// This is implemented for `u8`, as well as byte arrays `&[u8; 1]` to `&[u8; 16]`.
pub trait Chunk<'source>: Sized + Copy + PartialEq + Eq {
    /// Size of the chunk being accessed in bytes.
    const SIZE: usize;

    /// Create a chunk from a raw byte pointer.
    ///
    /// # Safety
    ///
    /// The implementations in this module read `Self::SIZE` bytes through
    /// `ptr` without any checks, so `ptr` must be valid for reading that many
    /// bytes. For the reference-typed implementations those bytes must also
    /// stay valid and unmodified for the whole `'source` lifetime, because
    /// the returned chunk borrows them rather than copying them.
    unsafe fn from_ptr(ptr: *const u8) -> Self;
}

/// A single byte is the smallest chunk a `Source` can hand out.
impl<'source> Chunk<'source> for u8 {
    const SIZE: usize = 1;

    /// Copies the one byte that `ptr` points at.
    #[inline]
    unsafe fn from_ptr(ptr: *const u8) -> Self {
        // SAFETY: the caller guarantees `ptr` is valid for reading one byte.
        ptr.read()
    }
}

/// Generates a `Chunk` implementation for `&[u8; N]` for every length `N`
/// passed in, so references to fixed-size byte arrays can be read directly
/// out of a `Source`.
macro_rules! impl_array {
    ($($len:expr),*) => {
        $(
            impl<'source> Chunk<'source> for &'source [u8; $len] {
                const SIZE: usize = $len;

                /// Reinterprets the bytes behind `ptr` as a borrowed array.
                #[inline]
                unsafe fn from_ptr(ptr: *const u8) -> Self {
                    // SAFETY: the caller guarantees `ptr` is valid for reading
                    // `SIZE` bytes for the duration of `'source`.
                    &*ptr.cast::<[u8; $len]>()
                }
            }
        )*
    };
}

// Array lengths 1 through 16 are supported.
impl_array!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);