pub struct Tokenizer<'a> {
pub states: Vec<StateFunction>,
/* private fields */
}
The Tokenizer type is used to produce and store tokens for lexers.
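As a rough sketch of how the methods below fit together (the input string and categories here are arbitrary, and only behavior documented on this page is assumed), a lexer typically creates a tokenizer, emits tokens as it walks the data, and collects the result at the end:

use luthor::token::Category;

// A hypothetical three-token scan: one keyword, one run of whitespace,
// and whatever is left over.
let mut tokenizer = luthor::Tokenizer::new("let x");
tokenizer.tokenize_next(3, Category::Keyword); // "let"
tokenizer.consume_whitespace();                // " "
let tokens = tokenizer.tokens();               // the remaining "x" is appended as a text token

assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0].lexeme, "let");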
Fields
states: Vec<StateFunction>
Implementations
impl<'a> Tokenizer<'a>
pub fn new(data: &str) -> Tokenizer<'_>
Initializes a new tokenizer with the given data.
Examples
let tokenizer = luthor::Tokenizer::new("luthor");
pub fn tokens(&self) -> Vec<Token>
Returns a copy of the tokens processed to date, in addition to any in-progress or remaining data appended as a text-category token. As a result, the returned tokens always produce the original dataset when their lexemes are concatenated.
Examples
use luthor::token::{Category, Token};
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");
tokenizer.tokenize_next(2, Category::Keyword);
assert_eq!(
tokenizer.tokens(),
vec![
Token{ lexeme: "lu".to_string(), category: Category::Keyword },
Token{ lexeme: "thor".to_string(), category: Category::Text }
]
);
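The concatenation guarantee can be checked directly; here is a small follow-up sketch (same input and call as above) that rebuilds the original data from the returned tokens:

use luthor::token::Category;

let mut tokenizer = luthor::Tokenizer::new("luthor");
tokenizer.tokenize_next(2, Category::Keyword);

// Joining the lexemes of all returned tokens reproduces the original data.
let rebuilt: String = tokenizer.tokens().iter().map(|t| t.lexeme.as_str()).collect();
assert_eq!(rebuilt, "luthor");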
pub fn advance(&mut self)
Moves to the next character in the data. Does nothing if there is no more data to process.
Examples
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");
// Ensure that we're at the first character.
assert_eq!(tokenizer.current_char().unwrap(), 'l');
// Consume the first character.
tokenizer.advance();
// Ensure that we're at the next character.
assert_eq!(tokenizer.current_char().unwrap(), 'u');
pub fn current_char(&self) -> Option<char>
Returns the character at the current position, or None once all of the data has been processed.
Examples
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("l");
// Ensure that the current character is correct.
assert_eq!(tokenizer.current_char().unwrap(), 'l');
// Consume the last bit of data.
tokenizer.advance();
// Ensure that there is no current character.
assert_eq!(tokenizer.current_char(), None);
pub fn next_non_whitespace_char(&self) -> Option<char>
Returns the next non-whitespace character, without advancing the cursor.
Examples
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new(" b");
// Ask for the next non-whitespace character.
assert_eq!(tokenizer.next_non_whitespace_char().unwrap(), 'b');
// Advance past the "b" character and ask again.
for _ in 0..3 { tokenizer.advance(); }
assert!(tokenizer.next_non_whitespace_char().is_none());
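Because the cursor does not move, this method works well for lookahead decisions. The sketch below (an arbitrary input, using only methods documented on this page) checks whether a name is followed by an opening parenthesis before deciding how to treat it:

// Scan past the name "print" without tokenizing it yet.
let mut tokenizer = luthor::Tokenizer::new("print  (x)");
for _ in 0..5 { tokenizer.advance(); }

// Peek past the run of spaces; the cursor stays on the first space.
let looks_like_call = tokenizer.next_non_whitespace_char() == Some('(');
assert!(looks_like_call);
assert_eq!(tokenizer.current_char(), Some(' '));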
pub fn has_prefix(&self, prefix: &str) -> bool
Whether or not the remaining data starts with the specified prefix.
Examples
// Set up a new tokenizer.
let tokenizer = luthor::Tokenizer::new("lex");
assert!(tokenizer.has_prefix("le"));
pub fn starts_with_lexeme(&self, lexeme: &str) -> bool
Whether or not the remaining data starts with the specified lexeme. Ensures that the specified lexeme is not just a prefix by checking that the data that follows it is a newline, space, comma, or nothing at all.
Examples
use luthor::token::Category;
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("lex\nluthor lib,rary");
// Prefixes don't count.
assert!(!tokenizer.starts_with_lexeme("le"));
// Newlines delineate lexemes.
assert!(tokenizer.starts_with_lexeme("lex"));
// Consume 4 characters, advancing to the next lexeme.
tokenizer.tokenize_next(4, Category::Text);
// Spaces delineate lexemes.
assert!(tokenizer.starts_with_lexeme("luthor"));
// Consume 7 characters, advancing to the next lexeme.
tokenizer.tokenize_next(7, Category::Text);
// Commas delineate lexemes.
assert!(tokenizer.starts_with_lexeme("lib"));
// Consume 4 characters, advancing to the next lexeme.
tokenizer.tokenize_next(4, Category::Text);
// End of string delineates lexemes.
assert!(tokenizer.starts_with_lexeme("rary"));
pub fn tokenize(&mut self, category: Category)
Creates and stores a token with the given category containing any data processed using advance since the last call to this method.
Examples
use luthor::token::Category;
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");
// Consume two characters and then tokenize them.
tokenizer.advance();
tokenizer.advance();
tokenizer.tokenize(Category::Text);
// Ensure that we have a correctly-categorized token.
assert_eq!(tokenizer.tokens()[0].lexeme, "lu");
pub fn tokenize_next(&mut self, amount: usize, category: Category)
Creates and stores a token with the given category and the next amount characters of the data. Before doing this, it tokenizes any previously processed characters with the generic Category::Text category.
Examples
use luthor::token::Category;
use luthor::token::Token;
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");
// Consume one character, and then tokenize the next 5.
tokenizer.advance();
tokenizer.tokenize_next(5, Category::Keyword);
// Ensure that we have two properly-categorized tokens.
assert_eq!(
tokenizer.tokens()[0],
Token{ lexeme: "l".to_string(), category: Category::Text }
);
assert_eq!(
tokenizer.tokens()[1],
Token{ lexeme: "uthor".to_string(), category: Category::Keyword }
);
pub fn consume_whitespace(&mut self)
Consumes consecutive whitespace characters as a single token.
Examples
use luthor::token::Category;
use luthor::token::Token;
let mut tokenizer = luthor::Tokenizer::new(" \nluthor");
tokenizer.consume_whitespace();
assert_eq!(
tokenizer.tokens()[0],
Token{ lexeme: " \n".to_string(), category: Category::Whitespace }
);
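Taken together, these methods are enough for a small hand-rolled scanning loop. The sketch below is only an illustration (the input is arbitrary and nothing beyond the behavior documented above is assumed): it walks the first word character by character, flushes it, folds the following whitespace into one token, and lets tokens() pick up the unprocessed remainder:

use luthor::token::{Category, Token};

let mut tokenizer = luthor::Tokenizer::new("lex  luthor");

// Advance over the first word one character at a time.
while tokenizer.current_char().map_or(false, |c| !c.is_whitespace()) {
    tokenizer.advance();
}
// Flush the characters advanced over so far as a single token.
tokenizer.tokenize(Category::Text);

// Fold the run of spaces that follows into one whitespace token.
tokenizer.consume_whitespace();

// Any unprocessed remainder is appended as a text token by tokens().
assert_eq!(
    tokenizer.tokens(),
    vec![
        Token{ lexeme: "lex".to_string(), category: Category::Text },
        Token{ lexeme: "  ".to_string(), category: Category::Whitespace },
        Token{ lexeme: "luthor".to_string(), category: Category::Text },
    ]
);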