use std::{
fmt::Display,
ops::{ControlFlow, Range},
};
/// A lexical token.
///
/// Punctuation and keyword variants carry no data; literal variants borrow the exact source
/// text they were lexed from, so the token cannot outlive the input string.
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum Token<'s> {
    /// `&`
    Amp,
    /// `&&`
    AmpAmp,
    /// `<=`
    AngleLeftEq,
    /// `>=`
    AngleRightEq,
    /// `->`
    Arrow,
    /// `!`
    Bang,
    /// `!=`
    BangEq,
    /// `:`
    Colon,
    /// `,`
    Comma,
    /// `=`
    Eq,
    /// `==`
    EqEq,
    /// `-`
    Hyphen,
    /// `.`
    Period,
    /// `|`
    Pipe,
    /// `||`
    PipePipe,
    /// `+`
    Plus,
    /// `?`
    QuestionMark,
    /// `;`
    SemiColon,
    /// `/`
    Slash,
    /// `*`
    Star,
    /// `**`
    StarStar,
    /// `^`
    Hat,
    /// `_` (a lone underscore that is not followed by a digit)
    Underscore,
    /// `%`
    Percent,
    /// `<<`
    ShiftLeft,
    /// `>>`
    ShiftRight,
    /// The word `rotl`.
    RotLeft,
    /// The word `rotr`.
    RotRight,
    /// `(`
    RoundLeft,
    /// `)`
    RoundRight,
    /// Decimal literal text as written: ASCII digits, possibly with `_` separators.
    Integer(&'s str),
    /// Hexadecimal literal text as written, including the leading `0x`.
    Hex(&'s str),
    /// Binary literal text as written, including the leading `0b`.
    Binary(&'s str),
    /// History reference text as written: `_` immediately followed by ASCII digits.
    History(&'s str),
}
/// Tokenizer over a borrowed string with a single slot of lookahead.
pub struct Lexer<'a> {
/// The part of the input that has not been consumed yet.
input: &'a str,
/// Byte length of the complete input as given to `new`; spans are computed as
/// `original_length - input.len()`.
original_length: usize,
/// Result buffered by `peek` and handed out by the following `next`.
/// `None` means nothing is buffered (or the buffered lookup hit the end of input).
peeked: Option<(Result<Token<'a>, ()>, Range<usize>)>,
}
impl<'a> Lexer<'a> {
    /// Returns the next token together with its byte span in the original input.
    ///
    /// A result buffered by an earlier [`Lexer::peek`] is handed out first.
    ///
    /// * `None` — the input is exhausted.
    /// * `Some((Ok(token), span))` — a token was recognised at `span`.
    /// * `Some((Err(()), span))` — the character at `span` does not start any token. The
    ///   character is consumed, so lexing can continue after the error.
    pub fn next(&mut self) -> Option<(Result<Token<'a>, ()>, Range<usize>)> {
        match self.peeked.take() {
            buffered @ Some(_) => buffered,
            None => self.next_inner(),
        }
    }

    /// Returns the upcoming result without consuming it.
    ///
    /// The result is lexed at most once and buffered; the following [`Lexer::next`] call
    /// returns the same value.
    pub fn peek(&mut self) -> &Option<(Result<Token<'a>, ()>, Range<usize>)> {
        if self.peeked.is_none() {
            self.peeked = self.next_inner();
        }
        &self.peeked
    }

    /// Lexes one token straight from the input, bypassing the peek buffer.
    fn next_inner(&mut self) -> Option<(Result<Token<'a>, ()>, Range<usize>)> {
        if let ControlFlow::Break((tok, span)) = self.next_token() {
            return Some((Ok(tok), span));
        }
        // No sub-lexer matched. Either the input is exhausted (no first character, so `?`
        // yields `None`) or it starts with something unrecognised.
        let offending = self.input.chars().next()?;
        // Consume the whole character: this guarantees forward progress (otherwise every
        // later call would report the same error forever) and keeps the span on UTF-8
        // boundaries so callers can slice the source with it. For ASCII the span is still
        // exactly one byte wide.
        let (_, span) = self.bump(offending.len_utf8());
        Some((Err(()), span))
    }
}
impl<'s> Lexer<'s> {
    /// Builds a lexer over `input` with nothing buffered for peeking.
    pub fn new(input: &'s str) -> Self {
        Self {
            peeked: None,
            original_length: input.len(),
            input,
        }
    }

    /// Splits the first `n` bytes off the remaining input.
    ///
    /// Returns the consumed text and its byte span measured from the start of the original
    /// input. `n` must fall on a character boundary inside the remaining input, because
    /// `str::split_at` panics otherwise.
    fn bump(&mut self, n: usize) -> (&'s str, Range<usize>) {
        let offset = self.original_length - self.input.len();
        let (consumed, remaining) = self.input.split_at(n);
        self.input = remaining;
        (consumed, offset..offset + consumed.len())
    }

    /// Reports whether all input has been consumed.
    fn is_empty(&self) -> bool {
        self.input.is_empty()
    }

    /// Skips whitespace and comments, then tries each sub-lexer in turn.
    ///
    /// `Break` carries the first token a sub-lexer recognised; `Continue` means either the
    /// input is exhausted or nothing matched.
    fn next_token(&mut self) -> ControlFlow<(Token<'s>, Range<usize>)> {
        self.skip_whitespace();
        if self.is_empty() {
            return ControlFlow::Continue(());
        }
        // Order is significant: `_<digits>` must win over a lone `_`, two-character
        // operators over their one-character prefixes, and prefixed literals over plain
        // integers.
        self.history()?;
        self.two_char_punctuation()?;
        self.one_char_punctuation()?;
        self.hex_number()?;
        self.bin_number()?;
        self.integer()?;
        self.words()
    }

    /// Drops leading whitespace and `#` comments, each of which runs up to (not including)
    /// the next newline or to the end of input.
    fn skip_whitespace(&mut self) {
        loop {
            self.input = self.input.trim_start();
            if !self.input.starts_with('#') {
                break;
            }
            let comment_len = self.input.find('\n').unwrap_or(self.input.len());
            self.input = &self.input[comment_len..];
        }
    }

    /// Recognises the two-character operators.
    fn two_char_punctuation(&mut self) -> ControlFlow<(Token<'s>, Range<usize>)> {
        // `get(..2)` is `None` when fewer than two bytes remain or when byte 2 is not a
        // character boundary; neither case can be one of the ASCII pairs below.
        let tok = match self.input.get(..2) {
            Some("==") => Token::EqEq,
            Some("!=") => Token::BangEq,
            Some("&&") => Token::AmpAmp,
            Some("||") => Token::PipePipe,
            Some(">=") => Token::AngleRightEq,
            Some("<=") => Token::AngleLeftEq,
            Some("->") => Token::Arrow,
            Some("<<") => Token::ShiftLeft,
            Some(">>") => Token::ShiftRight,
            Some("**") => Token::StarStar,
            _ => return ControlFlow::Continue(()),
        };
        let (_, span) = self.bump(2);
        ControlFlow::Break((tok, span))
    }

    /// Recognises the single-character operators and delimiters.
    fn one_char_punctuation(&mut self) -> ControlFlow<(Token<'s>, Range<usize>)> {
        let tok = match self.input.bytes().next() {
            Some(b'&') => Token::Amp,
            Some(b'=') => Token::Eq,
            Some(b'|') => Token::Pipe,
            Some(b'-') => Token::Hyphen,
            Some(b':') => Token::Colon,
            Some(b';') => Token::SemiColon,
            Some(b',') => Token::Comma,
            Some(b'.') => Token::Period,
            Some(b'+') => Token::Plus,
            Some(b'*') => Token::Star,
            Some(b'/') => Token::Slash,
            Some(b'!') => Token::Bang,
            Some(b'?') => Token::QuestionMark,
            Some(b'(') => Token::RoundLeft,
            Some(b')') => Token::RoundRight,
            Some(b'^') => Token::Hat,
            Some(b'%') => Token::Percent,
            Some(b'_') => Token::Underscore,
            _ => return ControlFlow::Continue(()),
        };
        let (_, span) = self.bump(1);
        ControlFlow::Break((tok, span))
    }

    /// Recognises `0x` followed by any run (possibly empty) of hex digits and `_`.
    ///
    /// The token text includes the `0x` prefix.
    fn hex_number(&mut self) -> ControlFlow<(Token<'s>, Range<usize>)> {
        const PREFIX: &str = "0x";
        let Some(body) = self.input.strip_prefix(PREFIX) else {
            return ControlFlow::Continue(());
        };
        let after_digits = body.trim_start_matches(|c: char| c.is_ascii_hexdigit() || c == '_');
        let body_len = body.len() - after_digits.len();
        let (text, span) = self.bump(PREFIX.len() + body_len);
        ControlFlow::Break((Token::Hex(text), span))
    }

    /// Recognises `0b` followed by any run (possibly empty) of `0`, `1` and `_`.
    ///
    /// The token text includes the `0b` prefix.
    fn bin_number(&mut self) -> ControlFlow<(Token<'s>, Range<usize>)> {
        const PREFIX: &str = "0b";
        let Some(body) = self.input.strip_prefix(PREFIX) else {
            return ControlFlow::Continue(());
        };
        let body_len = body
            .char_indices()
            .find(|&(_, c)| !matches!(c, '0' | '1' | '_'))
            .map_or(body.len(), |(idx, _)| idx);
        let (text, span) = self.bump(PREFIX.len() + body_len);
        ControlFlow::Break((Token::Binary(text), span))
    }

    /// Recognises a non-empty run of ASCII digits and `_`.
    fn integer(&mut self) -> ControlFlow<(Token<'s>, Range<usize>)> {
        // Counting bytes is safe: every accepted byte is ASCII, so the count is also the
        // index of the first rejected character.
        let literal_len = self
            .input
            .bytes()
            .take_while(|b| b.is_ascii_digit() || *b == b'_')
            .count();
        if literal_len == 0 {
            return ControlFlow::Continue(());
        }
        let (text, span) = self.bump(literal_len);
        ControlFlow::Break((Token::Integer(text), span))
    }

    /// Recognises `_` immediately followed by at least one ASCII digit.
    ///
    /// The token text includes the leading `_`.
    fn history(&mut self) -> ControlFlow<(Token<'s>, Range<usize>)> {
        let digit_count = match self.input.strip_prefix('_') {
            Some(after) => {
                after.len() - after.trim_start_matches(|c: char| c.is_ascii_digit()).len()
            }
            None => 0,
        };
        if digit_count == 0 {
            return ControlFlow::Continue(());
        }
        let (text, span) = self.bump(1 + digit_count);
        ControlFlow::Break((Token::History(text), span))
    }

    /// Recognises the keyword operators `rotl` and `rotr`.
    ///
    /// The leading run of ASCII letters must equal the keyword exactly; any other word
    /// (including an empty run) is left unconsumed.
    fn words(&mut self) -> ControlFlow<(Token<'s>, Range<usize>)> {
        let word_len = self
            .input
            .bytes()
            .take_while(u8::is_ascii_alphabetic)
            .count();
        let tok = match &self.input[..word_len] {
            "rotl" => Token::RotLeft,
            "rotr" => Token::RotRight,
            _ => return ControlFlow::Continue(()),
        };
        let (_, span) = self.bump(word_len);
        ControlFlow::Break((tok, span))
    }
}
/// Renders a token as its source text: the fixed spelling for operators and keywords, or
/// the borrowed source slice for literal tokens.
impl Display for Token<'_> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let text: &str = match self {
            // Literal tokens carry their own spelling.
            Token::Integer(src) | Token::Hex(src) | Token::Binary(src) | Token::History(src) => {
                *src
            }

            // Keyword operators.
            Token::RotLeft => "rotl",
            Token::RotRight => "rotr",

            // Two-character operators.
            Token::AmpAmp => "&&",
            Token::PipePipe => "||",
            Token::EqEq => "==",
            Token::BangEq => "!=",
            Token::AngleLeftEq => "<=",
            Token::AngleRightEq => ">=",
            Token::ShiftLeft => "<<",
            Token::ShiftRight => ">>",
            Token::StarStar => "**",
            Token::Arrow => "->",

            // One-character operators and delimiters.
            Token::Amp => "&",
            Token::Pipe => "|",
            Token::Eq => "=",
            Token::Bang => "!",
            Token::Plus => "+",
            Token::Hyphen => "-",
            Token::Star => "*",
            Token::Slash => "/",
            Token::Percent => "%",
            Token::Hat => "^",
            Token::Colon => ":",
            Token::SemiColon => ";",
            Token::Comma => ",",
            Token::Period => ".",
            Token::QuestionMark => "?",
            Token::Underscore => "_",
            Token::RoundLeft => "(",
            Token::RoundRight => ")",
        };
        f.write_str(text)
    }
}