use num_rational::ParseRatioError;
use std::num::ParseIntError;
use std::str::CharIndices;
/// Result of lexing one token: on success a `(start, token, end)` triple, otherwise an error.
///
/// In this file `Pos` is instantiated with `usize` byte offsets into the input, with `end`
/// being exclusive (one past the last byte of the token).
pub type Spanned<Tok, Pos, Error> = Result<(Pos, Tok, Pos), Error>;
/// State-machine lexer over a borrowed input string.
///
/// Tokens are produced through the `Iterator` implementation and borrow their text
/// directly from `input`, so no token owns any allocation.
pub struct Lexer<'input> {
// The complete text being lexed; token slices are cut out of it.
input: &'input str,
// Cursor over `input`, yielding `(byte offset, char)` pairs.
chars: CharIndices<'input>,
// The token currently being recognised (or `Init` between tokens).
state: LexerState,
}
/// Errors reported while turning the input into tokens.
#[derive(Debug, Clone, PartialEq)]
pub enum LexicalError {
// A character that cannot appear at this point: `(byte offset, offending char)`.
UnexpectedCharacter(usize, char),
// A line break inside a token that must stay on one line (the lexer raises this inside
// an inline comment): `(byte offset of the newline, lexer state at that moment)`.
UnexpectedNewline(usize, LexerState),
// Input ended in the middle of a token: `(offset reported by the lexer, lexer state)`.
// The lexer reports `input.len() - 1` (saturating at 0) as the offset.
UnexpectedEOF(usize, LexerState),
// Not constructed by the lexer code in this file; presumably produced by a later stage
// when an integer literal fails to parse, with the two `usize`s being the token span
// (TODO confirm against the caller).
ParseIntError(ParseIntError, usize, usize),
// Not constructed by the lexer code in this file; presumably the rational-number
// counterpart of `ParseIntError` above, again carrying a span (TODO confirm).
ParseRatioError(ParseRatioError, usize, usize),
}
/// A single token produced by the lexer.
///
/// Variants carrying a `&str` borrow the exact source text of the token from the input.
#[derive(Debug, Clone, PartialEq)]
pub enum LexTok<'input> {
    /// A line feed (`\n`).
    Newline,
    /// A `.` character.
    Dot,
    /// A `*` character.
    Star,
    /// A `-` character.
    Minus,
    /// A `%` character.
    Percent,
    /// A double-quoted string, including its surrounding quotes; `""` inside the string
    /// stands for an embedded quote.
    String(&'input str),
    /// A parenthesised comment, including the parentheses.
    InlineComment(&'input str),
    /// A `;` comment running up to (but not including) the end of the line.
    Comment(&'input str),
    /// A run of ASCII digits.
    Integer(&'input str),
    /// A run of ASCII letters.
    Letters(&'input str),
    /// A run of ASCII whitespace that contains no line feed.
    Whitespace(&'input str),
}

impl<'input> LexTok<'input> {
    /// Translates a grammar terminal name such as `"TokDot"` into a human-readable form.
    ///
    /// Fixed tokens map to their literal text and variable tokens to a `/regex/`-style
    /// description. Returns `None` for names that are not one of the known terminals.
    pub fn lalrpop_name_for_display(lalrpop_name: &str) -> Option<&'static str> {
        let display = match lalrpop_name {
            "TokNewline" => r#"\n"#,
            "TokDot" => ".",
            "TokStar" => "*",
            "TokMinus" => "-",
            "TokPercent" => "%",
            "TokString" => r#"/"[^"]*(""[^"]*)*"/"#,
            "TokInlineComment" => r#"/\(.*?\)/"#,
            "TokComment" => r#"/;[^\n]*/"#,
            "TokInteger" => "/[0-9]+/",
            "TokLetters" => "/[a-zA-Z]+/",
            "TokWhitespace" => r#"/[^\S\n]+/"#,
            // Anything else is not a terminal this lexer knows about.
            _ => return None,
        };
        Some(display)
    }
}
/// What the lexer is in the middle of recognising.
///
/// Every variant except `Init` records the byte offset at which the current token started.
#[derive(Debug, Clone, PartialEq)]
pub enum LexerState {
    /// Between tokens: the next character decides which token begins.
    Init,
    /// A line feed was seen at this offset and is waiting to be emitted.
    Newline(usize),
    /// A `.` was seen at this offset and is waiting to be emitted.
    Dot(usize),
    /// A `*` was seen at this offset and is waiting to be emitted.
    Star(usize),
    /// A `-` was seen at this offset and is waiting to be emitted.
    Minus(usize),
    /// A `%` was seen at this offset and is waiting to be emitted.
    Percent(usize),
    /// Inside a double-quoted string.
    String {
        /// Offset of the opening quote.
        start: usize,
        /// `true` right after a `"` inside the string: that quote either closes the
        /// string or is the first half of an escaped `""` pair.
        prev_could_be_escaped_quote: bool,
    },
    /// Inside a parenthesised comment that opened at this offset.
    InlineComment(usize),
    /// Inside a `;` comment that started at this offset.
    Comment(usize),
    /// Inside a run of digits that started at this offset.
    Integer(usize),
    /// Inside a run of letters that started at this offset.
    Letters(usize),
    /// Inside a run of non-newline whitespace that started at this offset.
    Whitespace(usize),
}

/// A fresh lexer starts between tokens.
impl Default for LexerState {
    fn default() -> Self {
        LexerState::Init
    }
}
impl<'input> Lexer<'input> {
pub fn new(input: &'input str) -> Self {
Lexer {
input,
chars: input.char_indices(),
state: Default::default(),
}
}
}
/// Streams tokens out of the input, one `(start, token, end)` triple per call.
///
/// Offsets are byte offsets into the input and `end` is exclusive.
impl<'input> Iterator for Lexer<'input> {
    type Item = Spanned<LexTok<'input>, usize, LexicalError>;

    /// Advances the state machine until a complete token (or an error) is available.
    ///
    /// Returns `None` once the input is exhausted between tokens. Returns
    /// `Some(Err(..))` when
    /// * a character cannot start or continue the current token
    ///   (`UnexpectedCharacter`; this includes any non-ASCII character),
    /// * a line feed occurs inside an inline comment (`UnexpectedNewline`), or
    /// * the input ends inside a string or inline comment (`UnexpectedEOF`).
    ///
    /// The lexer state is left untouched when an error is returned, so resuming
    /// iteration after an error continues the interrupted token.
    fn next(&mut self) -> Option<Self::Item> {
        use LexerState::*;
        use LexicalError::*;

        /// Maps the first character of a token to the state that recognises that token,
        /// or `None` when no token can begin with `c`.
        fn opening_state(pos: usize, c: char) -> Option<LexerState> {
            Some(match c {
                '\n' => LexerState::Newline(pos),
                '"' => LexerState::String {
                    start: pos,
                    prev_could_be_escaped_quote: false,
                },
                '(' => LexerState::InlineComment(pos),
                ';' => LexerState::Comment(pos),
                '.' => LexerState::Dot(pos),
                '*' => LexerState::Star(pos),
                '-' => LexerState::Minus(pos),
                '%' => LexerState::Percent(pos),
                c if c.is_ascii_digit() => LexerState::Integer(pos),
                c if c.is_ascii_alphabetic() => LexerState::Letters(pos),
                // '\n' was matched above, so only non-newline whitespace reaches here.
                c if c.is_ascii_whitespace() => LexerState::Whitespace(pos),
                _ => return None,
            })
        }

        // Copy of the input reference so token slices keep the full `'input` lifetime.
        // All offsets used for slicing come from `char_indices`, hence they always lie
        // on character boundaries.
        let input: &'input str = self.input;

        loop {
            match self.state {
                Init => {
                    let (pos, c) = self.chars.next()?;
                    match opening_state(pos, c) {
                        Some(state) => self.state = state,
                        None => return Some(Err(UnexpectedCharacter(pos, c))),
                    }
                }

                // Single-character tokens: the character was already consumed when the
                // state was entered, so just emit it.
                Newline(pos) | Dot(pos) | Star(pos) | Minus(pos) | Percent(pos) => {
                    let tok = match self.state {
                        Newline(_) => LexTok::Newline,
                        Dot(_) => LexTok::Dot,
                        Star(_) => LexTok::Star,
                        Minus(_) => LexTok::Minus,
                        Percent(_) => LexTok::Percent,
                        _ => unreachable!(),
                    };
                    self.state = Init;
                    return Some(Ok((pos, tok, pos + 1)));
                }

                // Inside a string, the previous character was not a quote.
                String {
                    start,
                    prev_could_be_escaped_quote: false,
                } => match self.chars.next() {
                    Some((_, '"')) => {
                        self.state = String {
                            start,
                            prev_could_be_escaped_quote: true,
                        }
                    }
                    Some((_, c)) if c.is_ascii() => {}
                    Some((pos, other)) => return Some(Err(UnexpectedCharacter(pos, other))),
                    None => {
                        return Some(Err(UnexpectedEOF(
                            input.len().saturating_sub(1),
                            self.state.clone(),
                        )))
                    }
                },

                // Inside a string, right after a quote. Peek instead of consuming: only
                // a second quote belongs to the string (escaped `""`). Anything else is
                // the start of the next token and must stay in the stream, otherwise the
                // character following every string literal would be silently dropped.
                String {
                    start,
                    prev_could_be_escaped_quote: true,
                } => {
                    let mut lookahead = self.chars.clone();
                    match lookahead.next() {
                        Some((_, '"')) => {
                            self.chars = lookahead;
                            self.state = String {
                                start,
                                prev_could_be_escaped_quote: false,
                            };
                        }
                        following => {
                            let end = following.map_or(input.len(), |(pos, _)| pos);
                            self.state = Init;
                            return Some(Ok((start, LexTok::String(&input[start..end]), end)));
                        }
                    }
                }

                InlineComment(start) => match self.chars.next() {
                    Some((end, ')')) => {
                        self.state = Init;
                        return Some(Ok((
                            start,
                            LexTok::InlineComment(&input[start..=end]),
                            end + 1,
                        )));
                    }
                    Some((pos, '\n')) => {
                        return Some(Err(UnexpectedNewline(pos, self.state.clone())))
                    }
                    Some((_, c)) if c.is_ascii() => {}
                    Some((pos, other)) => return Some(Err(UnexpectedCharacter(pos, other))),
                    None => {
                        return Some(Err(UnexpectedEOF(
                            input.len().saturating_sub(1),
                            self.state.clone(),
                        )))
                    }
                },

                Comment(start) => match self.chars.next() {
                    // The newline is not part of the comment; it becomes the next token.
                    Some((end, '\n')) => {
                        self.state = Newline(end);
                        return Some(Ok((start, LexTok::Comment(&input[start..end]), end)));
                    }
                    Some((_, c)) if c.is_ascii() => {}
                    Some((pos, other)) => return Some(Err(UnexpectedCharacter(pos, other))),
                    // A comment may run to the end of the input.
                    None => {
                        self.state = Init;
                        return Some(Ok((
                            start,
                            LexTok::Comment(&input[start..]),
                            input.len(),
                        )));
                    }
                },

                // Runs of digits / letters / whitespace end as soon as a character of a
                // different class (or the end of input) is seen.
                Integer(start) | Letters(start) | Whitespace(start) => {
                    let (end, following) = match self.chars.next() {
                        None => (input.len(), Init),
                        Some((pos, c)) => match opening_state(pos, c) {
                            Some(state) => (pos, state),
                            None => return Some(Err(UnexpectedCharacter(pos, c))),
                        },
                    };

                    // Same variant means the character extends the current run.
                    if std::mem::discriminant(&self.state) == std::mem::discriminant(&following)
                    {
                        continue;
                    }

                    let text = &input[start..end];
                    let tok = match self.state {
                        Integer(_) => LexTok::Integer(text),
                        Letters(_) => LexTok::Letters(text),
                        Whitespace(_) => LexTok::Whitespace(text),
                        _ => unreachable!(),
                    };
                    // The terminating character was consumed, so enter its state directly.
                    self.state = following;
                    return Some(Ok((start, tok, end)));
                }
            }
        }
    }
}