use alloc::borrow::Cow;
use alloc::string::String;
use alloc::vec::Vec;
use core::{char, iter::Peekable};
use crate::serialize::txt::errors::{LexerError, LexerResult};
/// A zone-file lexer that turns raw master-file text into a stream of [`Token`]s.
pub(crate) struct Lexer<'a> {
    /// Character stream over the input; `Peekable` so the state machine can
    /// inspect the next character without consuming it.
    txt: Peekable<CowChars<'a>>,
    /// Current state of the tokenizer's state machine.
    state: State,
}
impl<'a> Lexer<'a> {
    /// Creates a new lexer over `txt`, positioned at the start of the first line.
    pub(crate) fn new(txt: impl Into<Cow<'a, str>>) -> Self {
        Lexer {
            txt: CowChars {
                data: txt.into(),
                offset: 0,
            }
            .peekable(),
            state: State::StartLine,
        }
    }

    /// Returns the next token from the input.
    ///
    /// Returns `Ok(Some(token))` for each token, `Ok(None)` once the end of
    /// the input has been reached, and `Err` for malformed input (illegal
    /// characters, unclosed quotes or lists, unknown `$` directives, …).
    ///
    /// Implemented as a character-at-a-time state machine: each iteration
    /// peeks at the next character, and the current `State` decides whether to
    /// consume it, transition, and/or emit a token.
    pub(crate) fn next_token(&mut self) -> LexerResult<Option<Token>> {
        // Accumulators for multi-character tokens: `char_data` collects the
        // current word / quoted string / `$` directive name; `char_data_vec`
        // collects the entries of a parenthesized list.
        let mut char_data_vec: Option<Vec<String>> = None;
        let mut char_data: Option<String> = None;

        // Bounded loop: producing a single token never needs anywhere near
        // 4096 state transitions, so the assert below turns a stuck state
        // machine (an infinite-loop bug) into a loud panic instead of a hang.
        for i in 0..4_096 {
            assert!(i < 4095); // panics if the state machine fails to terminate
            let ch: Option<char> = self.peek();
            match self.state {
                State::StartLine => {
                    match ch {
                        Some('\r') | Some('\n') => {
                            self.state = State::EOL;
                        }
                        // Leading whitespace is significant in zone files: it
                        // means "this record reuses the previous owner name".
                        Some(ch) if ch.is_whitespace() => self.state = State::Blank,
                        Some(_) => self.state = State::RestOfLine,
                        None => {
                            self.state = State::EOF;
                        }
                    }
                }
                State::RestOfLine => {
                    match ch {
                        Some('@') => self.state = State::At,
                        Some('(') => {
                            self.txt.next();
                            char_data_vec = Some(Vec::new());
                            self.state = State::List;
                        }
                        // A close paren without a matching open is an error.
                        Some(ch @ ')') => return Err(LexerError::IllegalCharacter(ch)),
                        Some('$') => {
                            self.txt.next();
                            char_data = Some(String::new());
                            self.state = State::Dollar;
                        }
                        Some('\r') | Some('\n') => {
                            self.state = State::EOL;
                        }
                        Some('"') => {
                            self.txt.next();
                            char_data = Some(String::new());
                            self.state = State::Quote;
                        }
                        Some(';') => self.state = State::Comment { is_list: false },
                        Some(ch) if ch.is_whitespace() => {
                            self.txt.next(); // gobble interior whitespace
                        }
                        Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
                            char_data = Some(String::new());
                            self.state = State::CharData { is_list: false };
                        }
                        Some(ch) => return Err(LexerError::UnrecognizedChar(ch)),
                        None => {
                            self.state = State::EOF;
                        }
                    }
                }
                State::Blank => {
                    // Consume a single whitespace character and emit `Blank`;
                    // any following whitespace is gobbled by `RestOfLine`.
                    self.txt.next();
                    self.state = State::RestOfLine;
                    return Ok(Some(Token::Blank));
                }
                State::Comment { is_list } => {
                    match ch {
                        Some('\r') | Some('\n') => {
                            // The newline is not consumed here; a comment inside
                            // a list resumes the list instead of ending the line.
                            self.state = if is_list { State::List } else { State::EOL };
                        }
                        Some(_) => {
                            self.txt.next(); // gobble the comment text
                        }
                        None => {
                            self.state = State::EOF;
                        }
                    }
                }
                State::Quote => {
                    match ch {
                        // Closing quote: emit everything collected so far.
                        Some('"') => {
                            self.state = State::RestOfLine;
                            self.txt.next();
                            return Ok(Some(Token::CharData(
                                char_data.take().unwrap_or_else(|| "".into()),
                            )));
                        }
                        Some('\\') => {
                            Self::push_to_str(&mut char_data, self.escape_seq()?)?;
                        }
                        Some(ch) => {
                            self.txt.next();
                            Self::push_to_str(&mut char_data, ch)?;
                        }
                        None => return Err(LexerError::UnclosedQuotedString),
                    }
                }
                State::Dollar => {
                    match ch {
                        // Directive names are uppercase ASCII only.
                        Some(ch @ 'A'..='Z') => {
                            self.txt.next();
                            Self::push_to_str(&mut char_data, ch)?;
                        }
                        // Anything else (or EOF) terminates the directive name.
                        Some(_) | None => {
                            self.state = State::RestOfLine;
                            let dollar: String = char_data
                                .take()
                                .ok_or(LexerError::IllegalState("char_data is None"))?;
                            return Ok(Some(match dollar.as_str() {
                                "INCLUDE" => Token::Include,
                                "ORIGIN" => Token::Origin,
                                "TTL" => Token::Ttl,
                                _ => {
                                    // Report the unrecognized directive name.
                                    // (Previously this re-`take`d `char_data`,
                                    // which was already moved into `dollar`
                                    // above, so the error always carried an
                                    // empty string.)
                                    return Err(LexerError::UnrecognizedDollar(dollar));
                                }
                            }));
                        }
                    }
                }
                State::List => match ch {
                    Some(';') => {
                        self.txt.next();
                        self.state = State::Comment { is_list: true }
                    }
                    Some(')') => {
                        self.txt.next();
                        self.state = State::RestOfLine;
                        return match char_data_vec.take() {
                            Some(v) => Ok(Some(Token::List(v))),
                            None => Err(LexerError::IllegalState("char_data_vec is None")),
                        };
                    }
                    Some(ch) if ch.is_whitespace() => {
                        self.txt.next();
                    }
                    Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
                        char_data = Some(String::new());
                        self.state = State::CharData { is_list: true }
                    }
                    Some(ch) => return Err(LexerError::UnrecognizedChar(ch)),
                    None => return Err(LexerError::UnclosedList),
                },
                State::CharData { is_list } => {
                    match ch {
                        // `)` may not appear in a word outside a list.
                        Some(ch @ ')') if !is_list => {
                            return Err(LexerError::IllegalCharacter(ch));
                        }
                        // Whitespace, `)`, or `;` terminates the word.
                        Some(ch) if ch.is_whitespace() || ch == ')' || ch == ';' => {
                            if is_list {
                                // Push the finished word into the list; the
                                // terminator itself is handled by `List`.
                                char_data_vec
                                    .as_mut()
                                    .ok_or(LexerError::IllegalState("char_data_vec is None"))
                                    .and_then(|v| {
                                        let char_data = char_data
                                            .take()
                                            .ok_or(LexerError::IllegalState("char_data is None"))?;
                                        v.push(char_data);
                                        Ok(())
                                    })?;
                                self.state = State::List;
                            } else {
                                self.state = State::RestOfLine;
                                return match char_data.take() {
                                    Some(s) => Ok(Some(Token::CharData(s))),
                                    None => Err(LexerError::IllegalState("char_data is None")),
                                };
                            }
                        }
                        Some(ch) if !ch.is_control() && !ch.is_whitespace() => {
                            self.txt.next();
                            Self::push_to_str(&mut char_data, ch)?;
                        }
                        Some(ch) => return Err(LexerError::UnrecognizedChar(ch)),
                        None => {
                            // End of input also terminates the word.
                            self.state = State::EOF;
                            return char_data
                                .take()
                                .ok_or(LexerError::IllegalState("char_data is None"))
                                .map(|s| Some(Token::CharData(s)));
                        }
                    }
                }
                State::At => {
                    self.txt.next();
                    self.state = State::RestOfLine;
                    return Ok(Some(Token::At));
                }
                State::EOL => match ch {
                    // Accept both `\n` and `\r\n` line endings; a bare `\r`
                    // must be followed by `\n`.
                    Some('\r') => {
                        self.txt.next();
                    }
                    Some('\n') => {
                        self.txt.next();
                        self.state = State::StartLine;
                        return Ok(Some(Token::EOL));
                    }
                    Some(ch) => return Err(LexerError::IllegalCharacter(ch)),
                    None => return Err(LexerError::EOF),
                },
                State::EOF => {
                    self.txt.next();
                    return Ok(None);
                }
            }
        }

        unreachable!("The above match statement should have found a terminal state");
    }

    /// Appends `ch` to the string being collected, erroring if no collection
    /// is in progress (an internal-state bug).
    fn push_to_str(collect: &mut Option<String>, ch: char) -> LexerResult<()> {
        let Some(s) = collect.as_mut() else {
            return Err(LexerError::IllegalState("collect is None"));
        };
        s.push(ch);
        Ok(())
    }

    /// Consumes a `\` escape sequence and returns the character it denotes:
    /// either `\X` for a literal `X`, or a three-digit `\DDD` numeric escape.
    ///
    /// NOTE(review): RFC 1035 defines `\DDD` as a *decimal* octet
    /// (`d1 * 100 + d2 * 10 + d3`), but this combines the digits with bit
    /// shifts (`\077` becomes U+0707). The in-file tests pin the current
    /// behavior, so it is preserved here — confirm before changing.
    fn escape_seq(&mut self) -> LexerResult<char> {
        self.txt.next(); // consume the `\`
        let ch = self.peek().ok_or(LexerError::EOF)?;
        if !ch.is_control() {
            if ch.is_numeric() {
                // Three digits; the first is the character just peeked.
                let d1: u32 = self
                    .txt
                    .next()
                    .ok_or(LexerError::EOF)
                    .map(|c| c.to_digit(10).ok_or(LexerError::IllegalCharacter(c)))??;
                let d2: u32 = self
                    .txt
                    .next()
                    .ok_or(LexerError::EOF)
                    .map(|c| c.to_digit(10).ok_or(LexerError::IllegalCharacter(c)))??;
                let d3: u32 = self
                    .txt
                    .next()
                    .ok_or(LexerError::EOF)
                    .map(|c| c.to_digit(10).ok_or(LexerError::IllegalCharacter(c)))??;
                let val: u32 = (d1 << 16) + (d2 << 8) + d3;
                let ch: char = char::from_u32(val).ok_or(LexerError::UnrecognizedOctet(val))?;
                Ok(ch)
            } else {
                self.txt.next(); // consume the escaped character itself
                Ok(ch)
            }
        } else {
            Err(LexerError::IllegalCharacter(ch))
        }
    }

    /// Peeks at the next character without consuming it.
    fn peek(&mut self) -> Option<char> {
        self.txt.peek().copied()
    }
}
/// A `char` iterator over a `Cow<str>` that tracks its own byte offset, so the
/// lexer can own its (possibly borrowed) input while iterating it.
struct CowChars<'a> {
    // The input text, borrowed or owned.
    data: Cow<'a, str>,
    // Byte offset of the next character to yield.
    offset: usize,
}
impl Iterator for CowChars<'_> {
    type Item = char;

    /// Yields the character at the current byte offset (or `None` at the end
    /// of the data) and advances the offset past it.
    fn next(&mut self) -> Option<char> {
        let remaining = &self.data[self.offset..];
        let ch = remaining.chars().next()?;
        // Advancing by the character's UTF-8 width lands exactly on the next
        // character boundary (or on `data.len()` after the final character).
        self.offset += ch.len_utf8();
        Some(ch)
    }
}
/// States of the lexer's tokenizing state machine.
#[doc(hidden)]
#[derive(Copy, Clone, PartialEq, Debug)]
pub(crate) enum State {
    /// At the beginning of a line.
    StartLine,
    /// Past the first character of a line.
    RestOfLine,
    /// On significant leading whitespace (emits `Token::Blank`).
    Blank,
    /// Inside a parenthesized `( … )` list.
    List,
    /// Accumulating an unquoted word; `is_list` records whether it belongs to a list.
    CharData { is_list: bool },
    /// Inside a `;` comment; `is_list` records whether the comment is inside a list.
    Comment { is_list: bool },
    /// On an `@` (current-origin shorthand).
    At,
    /// Inside a double-quoted string.
    Quote,
    /// After a `$` (directive such as `$ORIGIN`).
    Dollar,
    /// At the end of a line.
    EOL,
    /// At the end of the input.
    EOF,
}
/// Tokens emitted by the zone-file [`Lexer`].
#[derive(Eq, PartialEq, Debug, Clone)]
pub enum Token {
    /// Significant leading whitespace (the record reuses the previous owner name).
    Blank,
    /// A parenthesized list of words, e.g. the SOA numeric fields.
    List(Vec<String>),
    /// A bare word or the contents of a quoted string.
    CharData(String),
    /// `@`, shorthand for the current origin.
    At,
    /// The `$INCLUDE` directive.
    Include,
    /// The `$ORIGIN` directive.
    Origin,
    /// The `$TTL` directive.
    Ttl,
    /// End of a line.
    EOL,
}
#[cfg(test)]
mod lex_test {
use alloc::string::ToString;
use super::*;
// Helper: advance the lexer by one token, failing the test (with debug output)
// on a lexing error so the individual tests can focus on the token stream.
#[allow(clippy::uninlined_format_args)]
fn next_token(lexer: &mut Lexer<'_>) -> Option<Token> {
let result = lexer.next_token();
assert!(result.is_ok(), "{:?}", result);
result.unwrap()
}
// Leading whitespace emits `Token::Blank`; interior whitespace only separates
// words and is otherwise dropped.
#[test]
fn blank() {
let mut lexer = Lexer::new("    dead beef");
assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("dead".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("beef".to_string())
);
let mut lexer = Lexer::new("dead beef");
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("dead".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("beef".to_string())
);
// `\r\n` is a single EOL; the new line's leading whitespace is Blank.
let mut lexer = Lexer::new("dead beef\r\n after");
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("dead".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("beef".to_string())
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("after".to_string())
);
// An empty list and a comment on the same line; the comment is dropped.
let mut lexer = Lexer::new(
"dead beef ();comment
         after",
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("dead".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("beef".to_string())
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::List(vec![]));
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("after".to_string())
);
}
// Outside quotes, backslash escapes are passed through verbatim (they are
// interpreted later, not by the lexer).
#[test]
fn escape() {
assert_eq!(
Lexer::new("a\\Aa").next_token().unwrap().unwrap(),
Token::CharData("a\\Aa".to_string())
);
assert_eq!(
Lexer::new("a\\$").next_token().unwrap().unwrap(),
Token::CharData("a\\$".to_string())
);
assert_eq!(
Lexer::new("a\\077").next_token().unwrap().unwrap(),
Token::CharData("a\\077".to_string())
);
}
// Inside quotes, escapes ARE decoded. NOTE: the `\077` -> U+0707 assertion
// pins `escape_seq`'s shift-based digit combination (RFC 1035 would give
// decimal 77, i.e. 'M'); see the review note on `escape_seq`.
#[test]
fn quoted_txt() {
assert_eq!(
Lexer::new("\"Quoted\"").next_token().unwrap().unwrap(),
Token::CharData("Quoted".to_string())
);
assert_eq!(
Lexer::new("\";@$\"").next_token().unwrap().unwrap(),
Token::CharData(";@$".to_string())
);
assert_eq!(
Lexer::new("\"some \\A\"").next_token().unwrap().unwrap(),
Token::CharData("some A".to_string())
);
assert_eq!(
Lexer::new("\"a\\Aa\"").next_token().unwrap().unwrap(),
Token::CharData("aAa".to_string())
);
assert_eq!(
Lexer::new("\"a\\$\"").next_token().unwrap().unwrap(),
Token::CharData("a$".to_string())
);
assert_eq!(
Lexer::new("\"a\\077\"").next_token().unwrap().unwrap(),
Token::CharData("a\u{707}".to_string())
);
// Incomplete escapes and short digit runs are errors.
assert!(Lexer::new("\"a\\\"").next_token().is_err());
assert!(Lexer::new("\"a\\0\"").next_token().is_err());
assert!(Lexer::new("\"a\\07\"").next_token().is_err());
// Quoted strings may span multiple lines; newlines are kept verbatim.
let mut lexer = Lexer::new("\"multi\nline\ntext\"");
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("multi\nline\ntext".to_string())
);
assert_eq!(next_token(&mut lexer), None);
let mut lexer = Lexer::new("\"multi\r\nline\r\ntext\"\r\n");
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("multi\r\nline\r\ntext".to_string())
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer), None);
// An unterminated quote is an error, not EOF.
assert!(Lexer::new("\"multi").next_token().is_err());
}
// Non-ASCII, non-control characters are valid CharData.
#[test]
fn unicode() {
assert_eq!(
Lexer::new("♥").next_token().unwrap().unwrap(),
Token::CharData("♥".to_string())
);
}
// Assorted single-token inputs: words, comments, `@`, `$` directives, EOLs.
#[test]
fn lex() {
assert_eq!(
next_token(&mut Lexer::new(".")).unwrap(),
Token::CharData(".".to_string())
);
assert_eq!(
next_token(&mut Lexer::new("            .")).unwrap(),
Token::Blank
);
assert_eq!(
next_token(&mut Lexer::new("abc")).unwrap(),
Token::CharData("abc".to_string())
);
assert_eq!(
next_token(&mut Lexer::new("abc.")).unwrap(),
Token::CharData("abc.".to_string())
);
// Comment-only input yields no tokens at all.
assert_eq!(next_token(&mut Lexer::new(";abc")), None);
assert_eq!(next_token(&mut Lexer::new(";;@$-\"")), None);
assert_eq!(next_token(&mut Lexer::new("@")).unwrap(), Token::At);
assert_eq!(
next_token(&mut Lexer::new("123")).unwrap(),
Token::CharData("123".to_string())
);
assert_eq!(
next_token(&mut Lexer::new("$INCLUDE")).unwrap(),
Token::Include
);
assert_eq!(
next_token(&mut Lexer::new("$ORIGIN")).unwrap(),
Token::Origin
);
assert_eq!(next_token(&mut Lexer::new("$TTL")).unwrap(), Token::Ttl);
assert_eq!(next_token(&mut Lexer::new("\n")), Some(Token::EOL));
assert_eq!(next_token(&mut Lexer::new("\r\n")), Some(Token::EOL));
}
// Parenthesized lists: unclosed/unopened parens are errors; newlines and
// comments inside a list do not terminate it.
#[test]
fn list() {
let mut lexer = Lexer::new("(");
assert!(lexer.next_token().is_err());
assert!(Lexer::new(")").next_token().is_err());
let mut lexer = Lexer::new("()");
assert_eq!(next_token(&mut lexer).unwrap(), Token::List(vec![]));
assert_eq!(next_token(&mut lexer), None);
let mut lexer = Lexer::new("(abc)");
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::List(vec!["abc".to_string()])
);
assert_eq!(next_token(&mut lexer), None);
let mut lexer = Lexer::new("(\nabc\n)");
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::List(vec!["abc".to_string()])
);
assert_eq!(next_token(&mut lexer), None);
let mut lexer = Lexer::new("(\nabc\nabc)");
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::List(vec!["abc".to_string(), "abc".to_string()])
);
assert_eq!(next_token(&mut lexer), None);
let mut lexer = Lexer::new("(\nabc;comment\n)");
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::List(vec!["abc".to_string()])
);
assert_eq!(next_token(&mut lexer), None);
}
// End-to-end lex of the RFC 1035 example SOA zone; exercises `@`, lists with
// comments, blank continuation lines, and a `$INCLUDE` directive.
#[test]
#[allow(clippy::cognitive_complexity)]
fn soa() {
let mut lexer = Lexer::new(
"@   IN  SOA     VENERA      Action\\.domains (
                               \
                 20     ; SERIAL
                               7200   ; REFRESH
                               \
                 600    ; RETRY
                               3600000; EXPIRE
                               \
                 60)    ; MINIMUM

        NS      A.ISI.EDU.
        NS      VENERA
        \
 NS      VAXA
        MX      10      VENERA
        MX      20      VAXA

\
 A       A       26.3.0.103

VENERA  A       10.1.0.52
        A \
      128.9.0.32

$INCLUDE <SUBSYS>ISI-MAILBOXES.TXT",
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::At);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("IN".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("SOA".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("VENERA".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("Action\\.domains".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::List(vec![
"20".to_string(),
"7200".to_string(),
"600".to_string(),
"3600000".to_string(),
"60".to_string(),
])
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("NS".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("A.ISI.EDU.".to_string())
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("NS".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("VENERA".to_string())
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("NS".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("VAXA".to_string())
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("MX".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("10".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("VENERA".to_string())
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("MX".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("20".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("VAXA".to_string())
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("A".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("A".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("26.3.0.103".to_string())
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("VENERA".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("A".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("10.1.0.52".to_string())
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::Blank);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("A".to_string())
);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("128.9.0.32".to_string())
);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::EOL);
assert_eq!(next_token(&mut lexer).unwrap(), Token::Include);
assert_eq!(
next_token(&mut lexer).unwrap(),
Token::CharData("<SUBSYS>ISI-MAILBOXES.TXT".to_string())
);
assert!(next_token(&mut lexer).is_none());
}
}