pub struct Tokenizer<'a> {
pub states: Vec<StateFunction>,
/* private fields */
}
The Tokenizer type is used to produce and store tokens for lexers.
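As a rough sketch of how the methods below fit together (the input string and categories here are arbitrary, and only behavior documented on this page is assumed), a lexer typically creates a tokenizer, emits tokens as it walks the data, and collects the result at the end:

use luthor::token::Category;

// A hypothetical three-token scan: one keyword, one run of whitespace,
// and whatever is left over.
let mut tokenizer = luthor::Tokenizer::new("let x");
tokenizer.tokenize_next(3, Category::Keyword); // "let"
tokenizer.consume_whitespace();                // " "
let tokens = tokenizer.tokens();               // the remaining "x" is appended as a text token

assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0].lexeme, "let");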
Fields
states: Vec<StateFunction>
Implementations
impl<'a> Tokenizer<'a>
pub fn new(data: &str) -> Tokenizer<'_>
Initializes a new tokenizer with the given data.
Examples
let tokenizer = luthor::Tokenizer::new("luthor");
pub fn tokens(&self) -> Vec<Token>
Returns a copy of the tokens processed to date, in addition to any in-progress or remaining data appended as a text-category token. As a result, the returned tokens always produce the original dataset when their lexemes are concatenated.
Examples
use luthor::token::{Category, Token};
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");
tokenizer.tokenize_next(2, Category::Keyword);
assert_eq!(
tokenizer.tokens(),
vec![
Token{ lexeme: "lu".to_string(), category: Category::Keyword },
Token{ lexeme: "thor".to_string(), category: Category::Text }
]
);
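The concatenation guarantee can be checked directly; here is a small follow-up sketch (same input and call as above) that rebuilds the original data from the returned tokens:

use luthor::token::Category;

let mut tokenizer = luthor::Tokenizer::new("luthor");
tokenizer.tokenize_next(2, Category::Keyword);

// Joining the lexemes of all returned tokens reproduces the original data.
let rebuilt: String = tokenizer.tokens().iter().map(|t| t.lexeme.as_str()).collect();
assert_eq!(rebuilt, "luthor");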
pub fn advance(&mut self)
Moves to the next character in the data. Does nothing if there is no more data to process.
Examples
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");
// Ensure that we're at the first character.
assert_eq!(tokenizer.current_char().unwrap(), 'l');
// Consume the first character.
tokenizer.advance();
// Ensure that we're at the next character.
assert_eq!(tokenizer.current_char().unwrap(), 'u');
pub fn current_char(&self) -> Option<char>
Returns the character at the current position, or None once all of the data has been processed.
Examples
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("l");
// Ensure that the current character is correct.
assert_eq!(tokenizer.current_char().unwrap(), 'l');
// Consume the last bit of data.
tokenizer.advance();
// Ensure that there is no current character.
assert_eq!(tokenizer.current_char(), None);
pub fn next_non_whitespace_char(&self) -> Option<char>
Returns the next non-whitespace character, without advancing the cursor.
Examples
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new(" b");
// Ask for the next non-whitespace character.
assert_eq!(tokenizer.next_non_whitespace_char().unwrap(), 'b');
// Advance past the "b" character and ask again.
for _ in 0..3 { tokenizer.advance(); }
assert!(tokenizer.next_non_whitespace_char().is_none());
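Because the cursor does not move, this method works well for lookahead decisions. The sketch below (an arbitrary input, using only methods documented on this page) checks whether a name is followed by an opening parenthesis before deciding how to treat it:

// Scan past the name "print" without tokenizing it yet.
let mut tokenizer = luthor::Tokenizer::new("print  (x)");
for _ in 0..5 { tokenizer.advance(); }

// Peek past the run of spaces; the cursor stays on the first space.
let looks_like_call = tokenizer.next_non_whitespace_char() == Some('(');
assert!(looks_like_call);
assert_eq!(tokenizer.current_char(), Some(' '));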
pub fn has_prefix(&self, prefix: &str) -> bool
Whether or not the remaining data starts with the specified prefix.
Examples
// Set up a new tokenizer.
let tokenizer = luthor::Tokenizer::new("lex");
assert!(tokenizer.has_prefix("le"));
pub fn starts_with_lexeme(&self, lexeme: &str) -> bool
Whether or not the remaining data starts with the specified lexeme. Ensures that the specified lexeme is not just a prefix by checking that the data that follows it is a newline, space, comma, or nothing at all.
Examples
use luthor::token::Category;
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("lex\nluthor lib,rary");
// Prefixes don't count.
assert!(!tokenizer.starts_with_lexeme("le"));
// Newlines delineate lexemes.
assert!(tokenizer.starts_with_lexeme("lex"));
// Consume 4 characters, advancing to the next lexeme.
tokenizer.tokenize_next(4, Category::Text);
// Spaces delineate lexemes.
assert!(tokenizer.starts_with_lexeme("luthor"));
// Consume 7 characters, advancing to the next lexeme.
tokenizer.tokenize_next(7, Category::Text);
// Commas delineate lexemes.
assert!(tokenizer.starts_with_lexeme("lib"));
// Consume 4 characters, advancing to the next lexeme.
tokenizer.tokenize_next(4, Category::Text);
// End of string delineates lexemes.
assert!(tokenizer.starts_with_lexeme("rary"));
pub fn tokenize(&mut self, category: Category)
Creates and stores a token with the given category containing any data processed using advance since the last call to this method.
Examples
use luthor::token::Category;
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");
// Consume two characters and then tokenize them.
tokenizer.advance();
tokenizer.advance();
tokenizer.tokenize(Category::Text);
// Ensure that we have a correctly-categorized token.
assert_eq!(tokenizer.tokens()[0].lexeme, "lu");
pub fn tokenize_next(&mut self, amount: usize, category: Category)
Creates and stores a token with the given category and the next amount characters of the data. Before doing this, it tokenizes any previously processed characters with the generic Category::Text category.
Examples
use luthor::token::Category;
use luthor::token::Token;
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");
// Consume one character, and then tokenize the next 5.
tokenizer.advance();
tokenizer.tokenize_next(5, Category::Keyword);
// Ensure that we have two properly-categorized tokens.
assert_eq!(
tokenizer.tokens()[0],
Token{ lexeme: "l".to_string(), category: Category::Text }
);
assert_eq!(
tokenizer.tokens()[1],
Token{ lexeme: "uthor".to_string(), category: Category::Keyword }
);
pub fn consume_whitespace(&mut self)
Consumes consecutive whitespace characters as a single token.
Examples
use luthor::token::Category;
use luthor::token::Token;
let mut tokenizer = luthor::Tokenizer::new(" \nluthor");
tokenizer.consume_whitespace();
assert_eq!(
tokenizer.tokens()[0],
Token{ lexeme: " \n".to_string(), category: Category::Whitespace }
);
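Taken together, these methods are enough for a small hand-rolled scanning loop. The sketch below is only an illustration (the input is arbitrary and nothing beyond the behavior documented above is assumed): it walks the first word character by character, flushes it, folds the following whitespace into one token, and lets tokens() pick up the unprocessed remainder:

use luthor::token::{Category, Token};

let mut tokenizer = luthor::Tokenizer::new("lex  luthor");

// Advance over the first word one character at a time.
while tokenizer.current_char().map_or(false, |c| !c.is_whitespace()) {
    tokenizer.advance();
}
// Flush the characters advanced over so far as a single token.
tokenizer.tokenize(Category::Text);

// Fold the run of spaces that follows into one whitespace token.
tokenizer.consume_whitespace();

// Any unprocessed remainder is appended as a text token by tokens().
assert_eq!(
    tokenizer.tokens(),
    vec![
        Token{ lexeme: "lex".to_string(), category: Category::Text },
        Token{ lexeme: "  ".to_string(), category: Category::Whitespace },
        Token{ lexeme: "luthor".to_string(), category: Category::Text },
    ]
);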