use std::fmt;
use std::error::Error;
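/// A position in the input text: a character offset plus a line and column
/// (line and column are not tracked independently yet; see `Tokenizer::reduce`
/// for how they are currently filled in).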
#[derive(Clone,Copy,Debug,PartialEq)]
pub struct SourceOffset {
pub char: usize,
pub line: usize,
pub column: usize,
}
impl SourceOffset {
pub fn new(char: usize, line: usize, column: usize) -> SourceOffset {
SourceOffset {
char: char,
line: line,
column: column,
}
}
}
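/// The location of a token in the input text, from its first to its last
/// character (both ends inclusive).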
#[derive(Clone,Copy,Debug,PartialEq)]
pub struct SourceLocation {
pub start: SourceOffset,
pub end: SourceOffset,
}
impl SourceLocation {
pub fn new(start: SourceOffset, end: SourceOffset) -> SourceLocation {
SourceLocation {
start: start,
end: end,
}
}
}
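/// Errors reported by the tokenizer; the `usize` payloads carry the character
/// offset at which the problem was detected.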
#[derive(Clone,Debug)]
pub enum TokenizerError {
CharacterNotAllowedHere(usize),
SpecialNotYetImplemented(usize),
EscapingBackslashAtEndOfInput,
UnclosedDoubleQuoteAtEndOfInput,
UnclosedSingleQuoteAtEndOfInput,
}
impl Error for TokenizerError {
fn description(&self) -> &str {
match *self {
TokenizerError::CharacterNotAllowedHere(_) => "Character not allowed here",
TokenizerError::SpecialNotYetImplemented(_) => "Special not yet implemented",
TokenizerError::EscapingBackslashAtEndOfInput => "Escaping backslash at end of input",
TokenizerError::UnclosedDoubleQuoteAtEndOfInput => {
"Unclosed double quote at end of input"
}
TokenizerError::UnclosedSingleQuoteAtEndOfInput => {
"Unclosed single quote at end of input"
}
}
}
}
impl fmt::Display for TokenizerError {
fn fmt(&self, f: &mut fmt::Formatter) -> Result<(), fmt::Error> {
self.description().fmt(f)
}
}
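/// The kind of a token: a run of whitespace, or a word (including quoted and
/// backslash-escaped text).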
#[derive(Clone,Copy,Debug,PartialEq)]
pub enum TokenType {
Whitespace,
Word,
}
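/// A single token: the matched slice of the input, its kind, and its location.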
#[derive(Clone,Copy,Debug,PartialEq)]
pub struct Token<'text> {
pub text: &'text str,
pub token_type: TokenType,
pub location: SourceLocation,
}
impl<'text> Token<'text> {
pub fn new(text: &'text str, token_type: TokenType, location: SourceLocation) -> Token {
Token {
text: text,
token_type: token_type,
location: location,
}
}
}
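// States of the tokenizer's state machine, one per lexical context (start of a
// token, inside whitespace, inside quotes, just after a backslash, and so on).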
#[derive(Clone,Copy,PartialEq)]
enum State {
Initial,
Special,
Whitespace,
Doublequote,
DoublequoteBackslash,
Singlequote,
SinglequoteBackslash,
Word,
WordBackslash,
}
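// Tokenizer state: the input text, the current machine state, the bounds and
// type of the token currently being accumulated, and the tokens produced so far.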
struct Tokenizer<'text> {
text: &'text str,
state: State,
token_type: Option<TokenType>,
token_start: usize,
token_end: usize,
tokens: Vec<Token<'text>>,
}
impl<'text> Tokenizer<'text> {
fn new(text: &'text str) -> Tokenizer {
Tokenizer {
text: text,
state: State::Initial,
token_type: None,
token_start: 0,
token_end: 0,
tokens: vec![],
}
}
fn reset(&mut self) {
self.state = State::Initial;
self.token_type = None;
self.token_start = 0;
self.token_end = 0;
}
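// Emit the token accumulated so far and start over. `token_start` and
// `token_end` are character offsets taken from `chars().enumerate()`, so the
// slice below is only correct for single-byte (ASCII) input; line and column
// are not tracked separately yet (line is always 0 and column repeats the
// character offset).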
fn reduce(&mut self) {
let token_text = &self.text[self.token_start..self.token_end + 1];
let loc = SourceLocation::new(SourceOffset::new(self.token_start, 0, self.token_start),
SourceOffset::new(self.token_end, 0, self.token_end));
self.tokens.push(Token::new(token_text,
self.token_type.expect("Invalid tokenization"),
loc));
self.reset();
}
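// Consume the current character: extend the current token to `offset` and move
// to `next_state`, starting a new token first if none is in progress.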
fn shift(&mut self, offset: usize, next_state: State) {
self.recognize(offset, next_state);
self.token_end = offset;
self.state = next_state;
}
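// Start a new token at `offset` if none is in progress; its type is Whitespace
// when entering the Whitespace state and Word otherwise.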
fn recognize(&mut self, offset: usize, next_state: State) {
if self.token_type.is_none() {
self.token_type = if next_state == State::Whitespace {
Some(TokenType::Whitespace)
} else {
Some(TokenType::Word)
};
self.token_start = offset;
}
}
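// A special character (';', '?' or '|') is currently emitted as a
// single-character Word token of its own.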
fn special(&mut self, offset: usize) {
self.shift(offset, State::Special);
self.reduce();
}
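// Handle the first character of a new token: whitespace, a special character,
// an opening quote, an escaping backslash, or an ordinary word character.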
fn initial(&mut self, offset: usize, c: char) {
if c.is_whitespace() {
self.shift(offset, State::Whitespace);
} else if c == ';' || c == '?' || c == '|' {
self.special(offset);
} else if c == '"' {
self.shift(offset, State::Doublequote);
} else if c == '\'' {
self.shift(offset, State::Singlequote);
} else if c == '\\' {
self.recognize(offset, State::Word);
self.shift(offset, State::WordBackslash);
} else {
self.shift(offset, State::Word);
}
}
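// Drive the state machine over the input one character at a time, then check
// that the final state is a legal stopping point. Offsets come from
// `chars().enumerate()`, i.e. they count characters, not bytes.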
fn tokenize(&mut self) -> Result<(), TokenizerError> {
for (offset, c) in self.text.chars().enumerate() {
match self.state {
State::Initial => self.initial(offset, c),
State::Whitespace => {
if c.is_whitespace() {
self.shift(offset, State::Whitespace);
} else {
self.reduce();
self.initial(offset, c);
};
}
State::Word => {
if c.is_whitespace() {
self.reduce();
self.shift(offset, State::Whitespace);
} else if c == ';' || c == '|' {
self.reduce();
self.special(offset);
} else if c == '"' {
self.reduce();
self.shift(offset, State::Doublequote);
} else if c == '\'' {
self.reduce();
self.shift(offset, State::Singlequote);
} else if c == '\\' {
self.shift(offset, State::WordBackslash);
} else {
self.shift(offset, State::Word);
}
}
State::WordBackslash => {
if c.is_alphanumeric() || c.is_whitespace() {
self.shift(offset, State::Word);
} else {
return Err(TokenizerError::CharacterNotAllowedHere(offset));
};
}
State::Doublequote => {
if c == '"' {
self.shift(offset, State::Doublequote);
self.reduce();
} else if c == '\\' {
self.shift(offset, State::DoublequoteBackslash);
} else {
self.shift(offset, State::Doublequote);
};
}
State::DoublequoteBackslash => {
if !c.is_whitespace() {
self.shift(offset, State::Doublequote);
} else {
return Err(TokenizerError::CharacterNotAllowedHere(offset));
};
}
State::Singlequote => {
if c == '\'' {
self.shift(offset, State::Singlequote);
self.reduce();
} else if c == '\\' {
self.shift(offset, State::SinglequoteBackslash);
} else {
self.shift(offset, State::Singlequote);
};
}
State::SinglequoteBackslash => {
if !c.is_whitespace() {
self.shift(offset, State::Singlequote);
} else {
return Err(TokenizerError::CharacterNotAllowedHere(offset));
};
}
State::Special => {
return Err(TokenizerError::SpecialNotYetImplemented(offset));
}
}
}
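// End of input: emit any word or whitespace run still in progress, and report
// unterminated escapes and quotes as errors.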
match self.state {
State::Initial => {}
State::Word | State::Whitespace => self.reduce(),
State::WordBackslash => return Err(TokenizerError::EscapingBackslashAtEndOfInput),
State::Doublequote => return Err(TokenizerError::UnclosedDoubleQuoteAtEndOfInput),
State::Singlequote => return Err(TokenizerError::UnclosedSingleQuoteAtEndOfInput),
State::DoublequoteBackslash |
State::SinglequoteBackslash => {
return Err(TokenizerError::EscapingBackslashAtEndOfInput)
}
State::Special => {
return Err(TokenizerError::SpecialNotYetImplemented(self.text.len() - 1))
}
}
Ok(())
}
}
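/// Splits `text` into `Word` and `Whitespace` tokens, or returns the first
/// `TokenizerError` encountered. Offsets in the returned locations count
/// characters, so they only match byte positions for ASCII input.
///
/// Illustrative usage (a sketch; the surrounding crate path is assumed, so the
/// example is not run as a doctest):
///
/// ```ignore
/// let tokens = tokenize("ls -la").unwrap();
/// assert_eq!(tokens.len(), 3);
/// assert_eq!(tokens[0].text, "ls");
/// assert_eq!(tokens[1].token_type, TokenType::Whitespace);
/// assert_eq!(tokens[2].text, "-la");
/// ```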
pub fn tokenize(text: &str) -> Result<Vec<Token>, TokenizerError> {
let mut tokenizer = Tokenizer::new(text);
match tokenizer.tokenize() {
Ok(_) => Ok(tokenizer.tokens),
Err(error) => Err(error),
}
}
#[cfg(test)]
mod test {
use super::*;
fn mk_token(text: &str, token_type: TokenType, start: usize, end: usize) -> Token {
Token::new(text,
token_type,
SourceLocation::new(SourceOffset::new(start, 0, start),
SourceOffset::new(end, 0, end)))
}
#[test]
fn empty_test() {
match tokenize("") {
Ok(ts) => assert_eq!(ts.len(), 0),
_ => panic!(),
};
}
#[test]
fn single_word() {
match tokenize("a") {
Ok(ts) => {
assert_eq!(ts.len(), 1);
assert_eq!(ts[0], mk_token("a", TokenType::Word, 0, 0));
}
_ => panic!(),
};
}
#[test]
fn multiple_words() {
match tokenize(" aa bb ccc ") {
Ok(ts) => {
assert_eq!(ts.len(), 7);
assert_eq!(ts[0], mk_token(" ", TokenType::Whitespace, 0, 0));
assert_eq!(ts[1], mk_token("aa", TokenType::Word, 1, 2));
assert_eq!(ts[2], mk_token(" ", TokenType::Whitespace, 3, 3));
assert_eq!(ts[3], mk_token("bb", TokenType::Word, 4, 5));
assert_eq!(ts[4], mk_token("  ", TokenType::Whitespace, 6, 7));
assert_eq!(ts[5], mk_token("ccc", TokenType::Word, 8, 10));
assert_eq!(ts[6], mk_token(" ", TokenType::Whitespace, 11, 11));
}
_ => panic!(),
};
}
#[test]
fn double_quoted_text() {
match tokenize(r#"a "b c""#) {
Ok(ts) => {
assert_eq!(ts.len(), 3);
assert_eq!(ts[0], mk_token("a", TokenType::Word, 0, 0));
assert_eq!(ts[1], mk_token(" ", TokenType::Whitespace, 1, 1));
assert_eq!(ts[2], mk_token(r#""b c""#, TokenType::Word, 2, 6));
}
_ => panic!(),
};
}
#[test]
fn single_quoted_text() {
match tokenize(r#"a '"b c"'"#) {
Ok(ts) => {
assert_eq!(ts.len(), 3);
assert_eq!(ts[0], mk_token("a", TokenType::Word, 0, 0));
assert_eq!(ts[1], mk_token(" ", TokenType::Whitespace, 1, 1));
assert_eq!(ts[2], mk_token(r#"'"b c"'"#, TokenType::Word, 2, 8));
}
_ => panic!(),
};
}
#[test]
fn escaped_whitespace_in_word() {
match tokenize(r#"a\ b"#) {
Ok(ts) => {
assert_eq!(ts.len(), 1);
assert_eq!(ts[0], mk_token(r#"a\ b"#, TokenType::Word, 0, 3));
}
_ => panic!(),
};
}
#[test]
fn character_not_allowed_here() {
match tokenize(r#"ab \!"#) {
Err(TokenizerError::CharacterNotAllowedHere(_)) => {}
_ => panic!(),
};
match tokenize(r#"ab "\ ab"#) {
Err(TokenizerError::CharacterNotAllowedHere(_)) => {}
_ => panic!(),
};
}
#[test]
fn escaping_backslash_at_end_of_input() {
match tokenize(r#"ab \"#) {
Err(TokenizerError::EscapingBackslashAtEndOfInput) => {}
_ => panic!(),
}
}
#[test]
fn unclosed_double_quote_at_end_of_input() {
match tokenize(r#"ab ""#) {
Err(TokenizerError::UnclosedDoubleQuoteAtEndOfInput) => {}
_ => panic!(),
}
}
#[test]
fn escaped_double_quote_at_end_of_input() {
match tokenize(r#"ab "\"#) {
Err(TokenizerError::EscapingBackslashAtEndOfInput) => {}
_ => panic!(),
}
}
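// Mirrors the double-quote case above for an unclosed single quote.
#[test]
fn unclosed_single_quote_at_end_of_input() {
match tokenize(r#"ab '"#) {
Err(TokenizerError::UnclosedSingleQuoteAtEndOfInput) => {}
_ => panic!(),
};
}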
}