Struct luthor::Tokenizer
pub struct Tokenizer<'a> {
    pub states: Vec<StateFunction>,
    // some fields omitted
}
The Tokenizer type is used to produce and store tokens for lexers.
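A short end-to-end sketch of how its methods combine, using only behavior documented on this page (the input and categories are illustrative):

use luthor::token::{Category, Token};

// Tokenize "let" as a keyword, then group the trailing whitespace.
let mut tokenizer = luthor::Tokenizer::new("let x");

if tokenizer.starts_with_lexeme("let") {
    tokenizer.tokenize_next(3, Category::Keyword);
}
tokenizer.consume_whitespace();

// The remaining data ("x") is appended as a text-category token by tokens().
assert_eq!(
    tokenizer.tokens(),
    vec![
        Token{ lexeme: "let".to_string(), category: Category::Keyword },
        Token{ lexeme: " ".to_string(), category: Category::Whitespace },
        Token{ lexeme: "x".to_string(), category: Category::Text }
    ]
);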
Fields
states: Vec<StateFunction>
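Lexers built on this type typically treat states as a stack of state functions. Below is a minimal sketch of driving that stack, assuming StateFunction is a tuple struct wrapping fn(&mut Tokenizer) -> Option<StateFunction>; that signature is an assumption, so check luthor::StateFunction for the actual definition.

use luthor::{StateFunction, Tokenizer};
use luthor::token::{Category, Token};

// Hypothetical state function: consume one character and stay in the
// same state until the data runs out. Assumes the StateFunction
// signature described above.
fn initial_state(tokenizer: &mut Tokenizer) -> Option<StateFunction> {
    match tokenizer.current_char() {
        Some(_) => {
            tokenizer.advance();
            Some(StateFunction(initial_state)) // remain in this state
        }
        None => None, // all data consumed; stop lexing
    }
}

let mut tokenizer = Tokenizer::new("luthor");
tokenizer.states.push(StateFunction(initial_state));

// Pop and run states until one returns None.
while let Some(StateFunction(function)) = tokenizer.states.pop() {
    if let Some(next_state) = function(&mut tokenizer) {
        tokenizer.states.push(next_state);
    }
}

// The data was advanced over but never explicitly tokenized, so
// tokens() returns it as a single text-category token.
assert_eq!(
    tokenizer.tokens(),
    vec![Token{ lexeme: "luthor".to_string(), category: Category::Text }]
);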
Methods
impl<'a> Tokenizer<'a>
fn new(data: &str) -> Tokenizer
Initializes a new tokenizer with the given data.
Examples
let tokenizer = luthor::Tokenizer::new("luthor");
fn tokens(&self) -> Vec<Token>
Returns a copy of the tokens processed to date, with any in-progress or remaining data appended as a text-category token. As a result, concatenating the lexemes of the returned tokens always reproduces the original data.
Examples
use luthor::token::{Category, Token};

// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");
tokenizer.tokenize_next(2, Category::Keyword);

assert_eq!(
    tokenizer.tokens(),
    vec![
        Token{ lexeme: "lu".to_string(), category: Category::Keyword },
        Token{ lexeme: "thor".to_string(), category: Category::Text }
    ]
);
fn advance(&mut self)
Moves to the next character in the data. Does nothing if there is no more data to process.
Examples
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");

// Ensure that we're at the first character.
assert_eq!(tokenizer.current_char().unwrap(), 'l');

// Consume the first character.
tokenizer.advance();

// Ensure that we're at the next character.
assert_eq!(tokenizer.current_char().unwrap(), 'u');
fn current_char(&self) -> Option<char>
Returns the character at the current position, or None if all of the data has been processed.
Examples
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("l");

// Ensure that the current character is correct.
assert_eq!(tokenizer.current_char().unwrap(), 'l');

// Consume the last bit of data.
tokenizer.advance();

// Ensure that there is no current character.
assert_eq!(tokenizer.current_char(), None);
fn next_non_whitespace_char(&self) -> Option<char>
Returns the next non-whitespace character, without advancing the cursor.
Examples
// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new(" b");

// Ask for the next non-whitespace character.
assert_eq!(tokenizer.next_non_whitespace_char().unwrap(), 'b');

// Advance past the "b" character and ask again.
for _ in 0..3 { tokenizer.advance(); }
assert!(tokenizer.next_non_whitespace_char().is_none());
fn has_prefix(&self, prefix: &str) -> bool
Whether or not the remaining data starts with the specified prefix.
Examples
// Set up a new tokenizer.
let tokenizer = luthor::Tokenizer::new("lex");

assert!(tokenizer.has_prefix("le"));
fn starts_with_lexeme(&self, lexeme: &str) -> bool
Whether or not the remaining data starts with the specified lexeme. Ensures that the specified lexeme is not just a prefix by checking that the data that follows it is a newline, space, comma, or nothing at all.
Examples
use luthor::token::Category;

// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("lex\nluthor lib,rary");

// Prefixes don't count.
assert!(!tokenizer.starts_with_lexeme("le"));

// Newlines delineate lexemes.
assert!(tokenizer.starts_with_lexeme("lex"));

// Consume 4 characters, advancing to the next lexeme.
tokenizer.tokenize_next(4, Category::Text);

// Spaces delineate lexemes.
assert!(tokenizer.starts_with_lexeme("luthor"));

// Consume 7 characters, advancing to the next lexeme.
tokenizer.tokenize_next(7, Category::Text);

// Commas delineate lexemes.
assert!(tokenizer.starts_with_lexeme("lib"));

// Consume 4 characters, advancing to the next lexeme.
tokenizer.tokenize_next(4, Category::Text);

// End of string delineates lexemes.
assert!(tokenizer.starts_with_lexeme("rary"));
fn tokenize(&mut self, category: Category)
Creates and stores a token with the given category, containing any data processed using advance since the last call to this method.
Examples
use luthor::token::Category;

// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");

// Consume two characters and then tokenize them.
tokenizer.advance();
tokenizer.advance();
tokenizer.tokenize(Category::Text);

// Ensure that we have a correctly-categorized token.
assert_eq!(tokenizer.tokens()[0].lexeme, "lu");
fn tokenize_next(&mut self, amount: usize, category: Category)
Creates and stores a token with the given category and the next amount characters of the data. Before doing this, it tokenizes any previously processed characters with the generic Category::Text category.
Examples
use luthor::token::{Category, Token};

// Set up a new tokenizer.
let mut tokenizer = luthor::Tokenizer::new("luthor");

// Consume one character, and then tokenize the next 5.
tokenizer.advance();
tokenizer.tokenize_next(5, Category::Keyword);

// Ensure that we have two properly-categorized tokens.
assert_eq!(
    tokenizer.tokens()[0],
    Token{ lexeme: "l".to_string(), category: Category::Text }
);
assert_eq!(
    tokenizer.tokens()[1],
    Token{ lexeme: "uthor".to_string(), category: Category::Keyword }
);
fn consume_whitespace(&mut self)
Consumes consecutive whitespace characters as a single token.
Examples
use luthor::token::{Category, Token};

let mut tokenizer = luthor::Tokenizer::new(" \nluthor");
tokenizer.consume_whitespace();

assert_eq!(
    tokenizer.tokens()[0],
    Token{ lexeme: " \n".to_string(), category: Category::Whitespace }
);