use std::iter::Peekable;
use std::str::Chars;
/// A cursor over a string slice that tracks its position as a byte offset.
///
/// Callers can record the current offset with [`Tokenizer::mark`], advance the cursor, and
/// then retrieve the text between the mark and the cursor with [`Tokenizer::token`].
///
/// All string slices handed out borrow from the original input (lifetime `'a`), not from the
/// tokenizer itself, so they remain usable while the tokenizer is advanced further.
#[derive(Clone, Debug)]
pub struct Tokenizer<'a> {
    /// The complete input string.
    input: &'a str,

    /// Byte offset into `input` of the next character to be read.
    index: usize,

    /// Character iterator over `input`, positioned at `index`.
    chars: Peekable<Chars<'a>>,
}

impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer positioned at the start of `input`.
    pub fn new(input: &'a str) -> Self {
        Self {
            input,
            index: 0,
            chars: input.chars().peekable(),
        }
    }

    /// Returns the entire input string, regardless of the current position.
    #[allow(dead_code)]
    pub fn input(&self) -> &'a str {
        self.input
    }

    /// Returns the remainder of the input, starting at the next character to be read.
    pub fn as_str(&self) -> &'a str {
        &self.input[self.index..]
    }

    /// Returns the current position as a byte offset into the input.
    pub fn mark(&self) -> usize {
        self.index
    }

    /// Returns the input from byte offset `mark` to the end.
    ///
    /// # Panics
    ///
    /// Panics if `mark` is past the end of the input or is not on a character boundary.
    #[allow(dead_code)]
    pub fn tail(&self, mark: usize) -> &'a str {
        &self.input[mark..]
    }

    /// Consumes and returns the next character, or `None` at the end of the input.
    ///
    /// The byte offset is advanced by the UTF-8 length of the consumed character.
    #[allow(clippy::should_implement_trait)]
    pub fn next(&mut self) -> Option<char> {
        let ch = self.chars.next();
        if let Some(c) = ch {
            self.index += c.len_utf8();
        }
        ch
    }

    /// Returns the next character without consuming it, or `None` at the end of the input.
    pub fn peek(&mut self) -> Option<char> {
        self.chars.peek().copied()
    }

    /// Returns the text between byte offset `mark` and the current position.
    ///
    /// # Panics
    ///
    /// Panics if `mark` is greater than the current position or is not on a character
    /// boundary.
    pub fn token(&self, mark: usize) -> &'a str {
        assert!(mark <= self.index, "mark follows index");
        &self.input[mark..self.index]
    }

    /// Returns the text between the byte offsets `mark` and `index`.
    ///
    /// # Panics
    ///
    /// Panics if `mark` is greater than `index`, if `index` is past the end of the input, or
    /// if either offset is not on a character boundary.
    #[allow(dead_code)]
    pub fn token2(&self, mark: usize, index: usize) -> &'a str {
        assert!(mark <= index, "mark follows index");
        &self.input[mark..index]
    }

    /// Moves the cursor to byte offset `mark` and rebuilds the character iterator from there.
    ///
    /// Panics if `mark` is past the end of the input or is not on a character boundary.
    fn reset_to(&mut self, mark: usize) {
        self.index = mark;
        self.chars = self.input[self.index..].chars().peekable();
    }

    /// Returns `true` if the next character is `ch`; `false` otherwise or at the end of input.
    pub fn is(&mut self, ch: char) -> bool {
        self.chars.peek() == Some(&ch)
    }

    /// Returns `true` if there is a next character and it satisfies `predicate`.
    pub fn has<P>(&mut self, predicate: P) -> bool
    where
        P: Fn(&char) -> bool,
    {
        self.chars.peek().map_or(false, predicate)
    }

    /// Returns `true` if no characters remain.
    pub fn at_end(&mut self) -> bool {
        self.chars.peek().is_none()
    }

    /// Consumes one character, if any remain.
    pub fn skip(&mut self) {
        self.next();
    }

    /// Consumes the next character, which must be `ch`.
    ///
    /// # Panics
    ///
    /// Panics if the next character is not `ch` (including at the end of the input).
    pub fn skip_char(&mut self, ch: char) {
        assert!(self.is(ch), "expected next character to be {:?}", ch);
        self.next();
    }

    /// Consumes up to `num_chars` characters, stopping early at the end of the input.
    pub fn skip_over(&mut self, num_chars: usize) {
        for _ in 0..num_chars {
            // Nothing more can be consumed once the input is exhausted.
            if self.next().is_none() {
                break;
            }
        }
    }

    /// Consumes characters for as long as the next one satisfies `predicate`.
    pub fn skip_while<P>(&mut self, predicate: P)
    where
        P: Fn(&char) -> bool,
    {
        while self.chars.peek().map_or(false, &predicate) {
            self.next();
        }
    }

    /// Parses a backslash escape at the cursor and returns the character it denotes.
    ///
    /// The cursor must be positioned on the backslash. The following forms are recognized:
    ///
    /// * `\a`, `\b`, `\f`, `\n`, `\r`, `\t`, `\v`: the corresponding control character.
    /// * `\ooo`: one to three octal digits giving a value in the range `0..=0o377`. Parsing
    ///   stops before a digit that would push the value past `0o377`, so `\777` yields
    ///   `'\x3f'` and leaves the final `7` unread.
    /// * `\xhh`, `\uhhhh`, `\Uhhhhhhhh`: up to 2, 4, or 8 hex digits giving a Unicode scalar
    ///   value. With no hex digits, or with digits that do not form a valid scalar value,
    ///   the letter itself is returned and the digits are left unread.
    /// * A backslash followed by any other character yields that character.
    /// * A backslash at the very end of the input yields a backslash.
    ///
    /// # Panics
    ///
    /// Panics if the next character is not a backslash.
    pub fn backslash_subst(&mut self) -> char {
        self.skip_char('\\');

        let c = match self.next() {
            Some(c) => c,
            // A lone trailing backslash stands for itself.
            None => return '\\',
        };

        match c {
            'a' => '\x07',
            'b' => '\x08',
            'f' => '\x0c',
            'n' => '\n',
            'r' => '\r',
            't' => '\t',
            'v' => '\x0b',
            '0'..='7' => self.octal_subst(c),
            'x' | 'u' | 'U' => self.hex_subst(c),
            _ => c,
        }
    }

    /// Finishes an octal escape whose first digit, `first`, has already been consumed.
    fn octal_subst(&mut self, first: char) -> char {
        let mut val = first
            .to_digit(8)
            .expect("caller only passes octal digits");

        // At most two more digits; stop early rather than overflow an eight-bit value.
        for _ in 1..3 {
            match self.peek().and_then(|ch| ch.to_digit(8)) {
                Some(digit) if val * 8 + digit <= 0xFF => {
                    val = val * 8 + digit;
                    self.next();
                }
                _ => break,
            }
        }

        // `val` never exceeds 0xFF, so the narrowing cast is lossless.
        char::from(val as u8)
    }

    /// Finishes a hex escape whose introducer, `kind` (`x`, `u`, or `U`), has already been
    /// consumed.
    fn hex_subst(&mut self, kind: char) -> char {
        let max = match kind {
            'x' => 2,
            'u' => 4,
            'U' => 8,
            _ => unreachable!("caller only passes 'x', 'u', or 'U'"),
        };

        // Hex digits are ASCII, so the byte distance from `mark` equals the digit count.
        let mark = self.mark();
        while self.index - mark < max && self.has(|ch| ch.is_digit(16)) {
            self.next();
        }

        // No digits at all: the introducer letter stands for itself.
        if self.index == mark {
            return kind;
        }

        let input = self.input;
        let val = u32::from_str_radix(&input[mark..self.index], 16)
            .expect("at most eight hex digits always fit in a u32");

        match std::char::from_u32(val) {
            Some(ch) => ch,
            None => {
                // Not a Unicode scalar value: give the digits back and return the letter.
                self.reset_to(mark);
                kind
            }
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `backslash_subst` on `input`, returning the substituted character together with
    /// the first character left unread afterwards, if any.
    fn bsubst(input: &str) -> (char, Option<char>) {
        let mut tok = Tokenizer::new(input);
        let substituted = tok.backslash_subst();
        let following = tok.as_str().chars().next();
        (substituted, following)
    }

    /// Checks each `(input, substituted, following)` row against `bsubst`.
    fn check_bsubst(cases: &[(&str, char, Option<char>)]) {
        for &(input, substituted, following) in cases {
            assert_eq!(bsubst(input), (substituted, following));
        }
    }

    // A fresh tokenizer exposes the whole input and peeks at its first character.
    #[test]
    fn test_basics() {
        let mut tok = Tokenizer::new("abc");
        assert_eq!(tok.input(), "abc");
        assert_eq!(tok.as_str(), "abc");
        assert_eq!(tok.peek(), Some('a'));
    }

    // `next` yields each character in turn and shrinks the remainder accordingly.
    #[test]
    fn test_next() {
        let mut tok = Tokenizer::new("abc");
        for &(ch, rest) in &[('a', "bc"), ('b', "c"), ('c', "")] {
            assert_eq!(tok.next(), Some(ch));
            assert_eq!(tok.as_str(), rest);
        }
        assert_eq!(tok.next(), None);
    }

    // `token` returns the text between a mark and the cursor.
    #[test]
    fn test_token() {
        let mut tok = Tokenizer::new("abcdef");
        for _ in 0..2 {
            tok.next();
        }
        assert_eq!(tok.as_str(), "cdef");

        let start = tok.mark();
        for _ in 0..2 {
            tok.next();
        }
        assert_eq!(tok.token(start), "cd");
        assert_eq!(tok.as_str(), "ef");

        // A mark taken at the cursor yields an empty token.
        let fresh = Tokenizer::new("abc");
        let start = fresh.mark();
        assert_eq!(fresh.token(start), "");
    }

    // `peek` reports the next character without consuming it.
    #[test]
    fn test_peek() {
        let mut tok = Tokenizer::new("abcdef");
        assert_eq!(tok.peek(), Some('a'));
        assert_eq!(tok.as_str(), "abcdef");

        for _ in 0..2 {
            tok.next();
        }
        assert_eq!(tok.peek(), Some('c'));
        assert_eq!(tok.as_str(), "cdef");
    }

    // `reset_to` rewinds both the offset and the character iterator.
    #[test]
    fn test_reset_to() {
        let mut tok = Tokenizer::new("abcdef");
        for _ in 0..2 {
            tok.next();
        }
        tok.reset_to(0);
        assert_eq!(tok.as_str(), "abcdef");
        assert_eq!(tok.peek(), Some('a'));

        for _ in 0..2 {
            tok.next();
        }
        let start = tok.mark();
        for _ in 0..2 {
            tok.next();
        }
        tok.reset_to(start);
        assert_eq!(tok.as_str(), "cdef");
        assert_eq!(tok.peek(), Some('c'));
    }

    // `is` matches only the next character and is false at the end of input.
    #[test]
    fn test_is() {
        let mut tok = Tokenizer::new("a");
        assert!(tok.is('a'));
        assert!(!tok.is('b'));

        tok.next();
        assert!(!tok.is('a'));
    }

    // `has` applies the predicate to the next character and is false at the end of input.
    #[test]
    fn test_has() {
        let mut tok = Tokenizer::new("a1");
        assert!(tok.has(|c| c.is_alphabetic()));

        tok.skip();
        assert!(!tok.has(|c| c.is_alphabetic()));

        tok.skip();
        assert!(!tok.has(|c| c.is_alphabetic()));
    }

    // `skip` consumes exactly one character at a time.
    #[test]
    fn test_skip() {
        let mut tok = Tokenizer::new("abc");
        let steps = [(Some('a'), "abc"), (Some('b'), "bc"), (Some('c'), "c")];
        for &(peeked, rest) in &steps {
            assert_eq!(tok.peek(), peeked);
            assert_eq!(tok.as_str(), rest);
            tok.skip();
        }
        assert_eq!(tok.peek(), None);
        assert_eq!(tok.as_str(), "");
    }

    // `skip_over` consumes the requested count, saturating at the end of input.
    #[test]
    fn test_skip_over() {
        let cases = [(2, Some('c'), "c"), (3, None, ""), (6, None, "")];
        for &(count, peeked, rest) in &cases {
            let mut tok = Tokenizer::new("abc");
            tok.skip_over(count);
            assert_eq!(tok.peek(), peeked);
            assert_eq!(tok.as_str(), rest);
        }
    }

    // `skip_while` stops at the first non-matching character or at the end of input.
    #[test]
    fn test_skip_while() {
        let mut tok = Tokenizer::new("aaabc");
        tok.skip_while(|&ch| ch == 'a');
        assert_eq!(tok.peek(), Some('b'));
        assert_eq!(tok.as_str(), "bc");

        let mut tok = Tokenizer::new("aaa");
        tok.skip_while(|&ch| ch == 'a');
        assert_eq!(tok.peek(), None);
        assert_eq!(tok.as_str(), "");
    }

    // Single-letter control escapes.
    #[test]
    fn test_backslash_subst_single() {
        check_bsubst(&[
            ("\\a-", '\x07', Some('-')),
            ("\\b-", '\x08', Some('-')),
            ("\\f-", '\x0c', Some('-')),
            ("\\n-", '\n', Some('-')),
            ("\\r-", '\r', Some('-')),
            ("\\t-", '\t', Some('-')),
            ("\\v-", '\x0b', Some('-')),
        ]);
    }

    // Octal escapes take at most three octal digits.
    #[test]
    fn test_backslash_subst_octal() {
        check_bsubst(&[
            ("\\1-", '\x01', Some('-')),
            ("\\17-", '\x0f', Some('-')),
            ("\\177-", '\x7f', Some('-')),
            ("\\1772-", '\x7f', Some('2')),
            ("\\18-", '\x01', Some('8')),
            ("\\8-", '8', Some('-')),
        ]);
    }

    // `\x` escapes with zero, one, and two hex digits.
    #[test]
    fn test_backslash_subst_hex2() {
        check_bsubst(&[
            ("\\x-", 'x', Some('-')),
            ("\\x1-", '\x01', Some('-')),
            ("\\x7f-", '\x7f', Some('-')),
        ]);
    }

    // `\u` escapes take at most four hex digits.
    #[test]
    fn test_backslash_subst_hex4() {
        check_bsubst(&[
            ("\\u-", 'u', Some('-')),
            ("\\u7-", '\x07', Some('-')),
            ("\\u77-", 'w', Some('-')),
            ("\\u077-", 'w', Some('-')),
            ("\\u0077-", 'w', Some('-')),
            ("\\u00077-", '\x07', Some('7')),
        ]);
    }

    // `\U` escapes take at most eight hex digits.
    #[test]
    fn test_backslash_subst_hex8() {
        check_bsubst(&[
            ("\\U-", 'U', Some('-')),
            ("\\U7-", '\x07', Some('-')),
            ("\\U77-", 'w', Some('-')),
            ("\\U077-", 'w', Some('-')),
            ("\\U0077-", 'w', Some('-')),
            ("\\U00077-", 'w', Some('-')),
            ("\\U000077-", 'w', Some('-')),
            ("\\U0000077-", 'w', Some('-')),
            ("\\U00000077-", 'w', Some('-')),
            ("\\U000000077-", '\x07', Some('7')),
        ]);
    }

    // Any other escaped character stands for itself, as does a trailing backslash.
    #[test]
    fn test_backslash_subst_other() {
        check_bsubst(&[("\\*-", '*', Some('-')), ("\\", '\\', None)]);
    }
}