pub struct Tokenizer<'a> { /* private fields */ }
The Crossandra tokenizer, operating on literals and patterns.
§Literals
Literals are values that must be matched exactly by the tokenizer. They are represented as a slice of (name, value) pairs. For example, a literal map for Brainfuck could be defined like this:
let literals = [
    ("add", "+"),
    ("sub", "-"),
    ("left", "<"),
    ("right", ">"),
    ("read", ","),
    ("write", "."),
    ("begin_loop", "["),
    ("end_loop", "]"),
];
Literals take precedence over patterns.
§Patterns
Patterns are regular expressions that match more complex token structures. They are represented
as pairs of strings (name, pattern) in a Vec to maintain a consistent matching order.
The order of patterns matters as the tokenizer will use the first matching pattern it finds.
Duplicate pattern names are not allowed and will result in an error. This crate also provides a
collection of commonly used patterns in the common module. For example, patterns covering
binary, octal, and hexadecimal literals could be defined like this:
let patterns = vec![
    ("binary".into(), r"0[bB][01]+".into()),
    ("octal".into(), r"0[Oo][0-7]+".into()),
    ("hexadecimal".into(), r"(?i)0x[0-9a-f]+".into()),
];
§Other options
§ignore_whitespace
Whether to ignore the following whitespace characters:
| Code | Character |
|---|---|
| 0x9 | Tab (\t) |
| 0xa | Line feed (\n) |
| 0xb | Vertical tab |
| 0xc | Form feed |
| 0xd | Carriage return (\r) |
| 0x20 | Space ( ) |
Defaults to false.
§ignored_characters
A set of characters to ignore during tokenization. Defaults to an empty set.
§Fast Mode
When all literals are of length 1 and there are no patterns, Crossandra uses a simpler, faster tokenization method.
For instance, tokenizing a 1MB random Brainfuck file with 10% of the file being comments is ~300x faster with Fast Mode (110ms vs 32.5s on an Apple M2).
Do note that this is a rather extreme case; for a 1KB file, the speedup is ~2.3x.
§Implementations
impl<'a> Tokenizer<'a>
pub fn new(
    literals: &[(&'a str, &'a str)],
    patterns: Vec<(String, String)>,
    ignored_characters: FxHashSet<char>,
    ignore_whitespace: bool,
) -> Result<Self, Error>
Creates a new Tokenizer, returning an Error if the configuration is invalid (for example, if pattern names are duplicated).
pub fn tokenize(
    &'a self,
    source: &'a str,
) -> Box<dyn Iterator<Item = Result<Token, Error>> + 'a>
Tokenizes the given source, yielding one Result per token.
pub fn tokenize_lines(
    &'a self,
    source: &'a str,
) -> impl ParallelIterator<Item = Result<Vec<Token>, Error>> + 'a
Tokenizes the source line by line in parallel, yielding one Result per line.
pub fn with_literals(
    self,
    literals: &[(&'a str, &'a str)],
) -> Result<Self, Error>
Sets the literals of this Tokenizer and returns itself, or an Error if the new literals are invalid.
pub fn with_ignored_characters(
    self,
    ignored_characters: FxHashSet<char>,
) -> Self
Sets the ignored characters of this Tokenizer and
returns itself.
pub fn with_ignore_whitespace(self, ignore_whitespace: bool) -> Self
Sets the ignore_whitespace option of this Tokenizer and
returns itself.
pub fn set_ignored_characters(&mut self, ignored_characters: FxHashSet<char>)
Sets the ignored characters of this Tokenizer.
pub fn set_ignore_whitespace(&mut self, ignore_whitespace: bool)
Sets the ignore_whitespace option of this Tokenizer.
§Trait Implementations
impl Eq for Tokenizer<'_>
§Auto Trait Implementations
impl<'a> Freeze for Tokenizer<'a>
impl<'a> RefUnwindSafe for Tokenizer<'a>
impl<'a> Send for Tokenizer<'a>
impl<'a> Sync for Tokenizer<'a>
impl<'a> Unpin for Tokenizer<'a>
impl<'a> UnwindSafe for Tokenizer<'a>
§Blanket Implementations
impl<T> BorrowMut<T> for T
where
    T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
impl<T> CloneToUninit for T
where
    T: Clone,
impl<T> IntoEither for T
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts self into a Left variant of Either<Self, Self> if into_left is true. Converts self into a Right variant of Either<Self, Self> otherwise.
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
where
    F: FnOnce(&Self) -> bool,
Converts self into a Left variant of Either<Self, Self> if into_left(&self) returns true. Converts self into a Right variant of Either<Self, Self> otherwise.