Struct Tokenizer

pub struct Tokenizer<'a> {
    pub states: Vec<StateFunction>,
    /* private fields */
}

The Tokenizer type is used to produce and store tokens for lexers.
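
For orientation, here is a small end-to-end sketch that uses only the methods documented below; the input string and chosen categories are arbitrary illustrations:

use luthor::token::Category;

// Tokenize the first word as a keyword.
let mut tokenizer = luthor::Tokenizer::new("luthor lib");
tokenizer.tokenize_next(6, Category::Keyword);

// Consume the separating whitespace as its own token.
tokenizer.consume_whitespace();

// Remaining data ("lib") is appended as a text-category token.
for token in tokenizer.tokens() {
    println!("{:?}: {:?}", token.category, token.lexeme);
}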

Fields

states: Vec<StateFunction>
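
The states field lets a lexer model its grammar as a small state machine, keeping track of StateFunction values (for example, as a stack of states to return to after handling a nested construct). The sketch below is hypothetical usage, assuming StateFunction is a tuple struct with a public field wrapping fn(&mut Tokenizer) -> Option<StateFunction> and is importable from the crate root; neither detail is shown on this page:

use luthor::token::Category;
use luthor::{StateFunction, Tokenizer}; // import path assumed

// Hypothetical state: tokenize one keyword, then resume whatever
// state was previously pushed onto the tokenizer's stack, if any.
fn keyword_state(tokenizer: &mut Tokenizer) -> Option<StateFunction> {
    tokenizer.tokenize_next(6, Category::Keyword);
    tokenizer.states.pop()
}

let mut tokenizer = Tokenizer::new("luthor");

// Drive the machine until a state declines to hand off a successor.
let mut state = Some(StateFunction(keyword_state));
while let Some(StateFunction(function)) = state {
    state = function(&mut tokenizer);
}

assert_eq!(tokenizer.tokens()[0].lexeme, "luthor");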

Implementations

impl<'a> Tokenizer<'a>

pub fn new(data: &str) -> Tokenizer<'_>

Initializes a new tokenizer with the given data.

Examples
let tokenizer = luthor::Tokenizer::new("luthor");

pub fn tokens(&self) -> Vec<Token>

Returns a copy of the tokens processed to date, with any in-progress or remaining data appended as a final text-category token. As a result, concatenating the lexemes of the returned tokens always reproduces the original data.

Examples
use luthor::token::{Category, Token};

// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");
tokenizer.tokenize_next(2, Category::Keyword);

assert_eq!(
    tokenizer.tokens(),
    vec![
        Token{ lexeme: "lu".to_string(), category: Category::Keyword },
        Token{ lexeme: "thor".to_string(), category: Category::Text }
    ]
);
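
The round-trip property described above can be verified directly, continuing from the example:

// Concatenating the returned lexemes reproduces the original data.
let reassembled: String = tokenizer
    .tokens()
    .iter()
    .map(|token| token.lexeme.clone())
    .collect();
assert_eq!(reassembled, "luthor");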

pub fn advance(&mut self)

Moves to the next character in the data. Does nothing if there is no more data to process.

Examples
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");

// Ensure that we're at the first character.
assert_eq!(tokenizer.current_char().unwrap(), 'l');

// Consume the first character.
tokenizer.advance();

// Ensure that we're at the next character.
assert_eq!(tokenizer.current_char().unwrap(), 'u');

pub fn current_char(&self) -> Option<char>

Returns the character at the current position, or None if all of the data has been processed.

Examples
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("l");

// Ensure that the current character is correct.
assert_eq!(tokenizer.current_char().unwrap(), 'l');

// Consume the last bit of data.
tokenizer.advance();

// Ensure that there is no current character.
assert_eq!(tokenizer.current_char(), None);

pub fn next_non_whitespace_char(&self) -> Option<char>

Returns the next non-whitespace character, without advancing the cursor.

Examples
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("  b");

// Ask for the next non-whitespace character.
assert_eq!(tokenizer.next_non_whitespace_char().unwrap(), 'b');

// Advance past the "b" character and ask again.
for _ in 0..3 { tokenizer.advance(); }
assert!(tokenizer.next_non_whitespace_char().is_none());

pub fn has_prefix(&self, prefix: &str) -> bool

Whether or not the remaining data starts with the specified prefix.

Examples
// Set up a new tokenizer.
let tokenizer = luthor::Tokenizer::new("lex");

assert!(tokenizer.has_prefix("le"));
assert!(!tokenizer.has_prefix("ex"));

pub fn starts_with_lexeme(&self, lexeme: &str) -> bool

Whether or not the remaining data starts with the specified lexeme. Ensures that the specified lexeme is not just a prefix by checking that the data that follows it is a newline, space, comma, or nothing at all.

Examples
use luthor::token::Category;

// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("lex\nluthor lib,rary");

// Prefixes don't count.
assert!(!tokenizer.starts_with_lexeme("le"));

// Newlines delimit lexemes.
assert!(tokenizer.starts_with_lexeme("lex"));

// Consume 4 characters, advancing to the next lexeme.
tokenizer.tokenize_next(4, Category::Text);

// Spaces delimit lexemes.
assert!(tokenizer.starts_with_lexeme("luthor"));

// Consume 7 characters, advancing to the next lexeme.
tokenizer.tokenize_next(7, Category::Text);

// Commas delimit lexemes.
assert!(tokenizer.starts_with_lexeme("lib"));

// Consume 4 characters, advancing to the next lexeme.
tokenizer.tokenize_next(4, Category::Text);

// The end of the data also delimits lexemes.
assert!(tokenizer.starts_with_lexeme("rary"));

pub fn tokenize(&mut self, category: Category)

Creates and stores a token with the given category containing any data processed using advance since the last call to this method.

Examples
use luthor::token::Category;

// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");

// Consume two characters and then tokenize them.
tokenizer.advance();
tokenizer.advance();
tokenizer.tokenize(Category::Text);

// Ensure that we have a correctly-categorized token.
assert_eq!(tokenizer.tokens()[0].lexeme, "lu");

pub fn tokenize_next(&mut self, amount: usize, category: Category)

Creates and stores a token with the given category, containing the next amount characters of the data. Before doing this, it tokenizes any previously processed characters with the generic Category::Text category.

Examples
use luthor::token::Category;
use luthor::token::Token;

// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");

// Consume one character, and then tokenize the next 5.
tokenizer.advance();
tokenizer.tokenize_next(5, Category::Keyword);

// Ensure that we have two properly-categorized tokens.
assert_eq!(
    tokenizer.tokens()[0],
    Token{ lexeme: "l".to_string(), category: Category::Text }
);
assert_eq!(
    tokenizer.tokens()[1],
    Token{ lexeme: "uthor".to_string(), category: Category::Keyword }
);

pub fn consume_whitespace(&mut self)

Consumes consecutive whitespace characters as a single whitespace-category token.

Examples
use luthor::token::Category;
use luthor::token::Token;

let mut tokenizer = luthor::Tokenizer::new("  \nluthor");
tokenizer.consume_whitespace();

assert_eq!(
    tokenizer.tokens()[0],
    Token{ lexeme: "  \n".to_string(), category: Category::Whitespace }
);

Auto Trait Implementations

impl<'a> Freeze for Tokenizer<'a>

impl<'a> RefUnwindSafe for Tokenizer<'a>

impl<'a> Send for Tokenizer<'a>

impl<'a> Sync for Tokenizer<'a>

impl<'a> Unpin for Tokenizer<'a>

impl<'a> UnwindSafe for Tokenizer<'a>

Blanket Implementations

impl<T> Any for T
where T: 'static + ?Sized,

fn type_id(&self) -> TypeId

Gets the TypeId of self.

impl<T> Borrow<T> for T
where T: ?Sized,

fn borrow(&self) -> &T

Immutably borrows from an owned value.

impl<T> BorrowMut<T> for T
where T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value.

impl<T> From<T> for T

fn from(t: T) -> T

Returns the argument unchanged.

impl<T, U> Into<U> for T
where U: From<T>,

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

impl<T, U> TryFrom<U> for T
where U: Into<T>,

type Error = Infallible

The type returned in the event of a conversion error.

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.