use std::fmt;
use crate::error::{Expected, LexerError};
/// The category of a lexed [`Token`].
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub(crate) enum TokenKind {
/// An identifier: an `XID_Start` character followed by `XID_Continue` characters.
Ident,
/// A punctuation token: one of `{ } [ ] : , ( ) + -`, or the two-character `..`.
Punct,
/// An integer literal (decimal, or prefixed with `0x`/`0o`/`0b` in either case).
Integer,
/// A numeric literal that has a fractional part and/or an exponent.
Float,
/// A double-quoted string literal.
String,
/// A single-quoted character literal.
Char,
/// Produced when no input remains.
Eof,
}
/// A single token borrowed from the input text.
#[derive(Copy, Clone, Debug)]
pub(crate) struct Token<'de> {
/// What kind of token this is.
pub kind: TokenKind,
/// The slice of the input covered by the token, exactly as written
/// (string and character literals keep their surrounding quotes).
pub value: &'de str,
}
impl<'de> Token<'de> {
    /// Reports whether this token is a punctuation token whose text is exactly `punct`.
    ///
    /// Tokens of any other kind never match, even if their text equals `punct`.
    pub fn is_punct(&self, punct: &str) -> bool {
        matches!(self.kind, TokenKind::Punct) && punct == self.value
    }
}
impl fmt::Display for TokenKind {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
f.write_str(match self {
Self::String => "a string",
Self::Char => "a character literal",
Self::Integer => "an integer",
Self::Float => "a floating-point number",
Self::Punct => "a punctuation token",
Self::Ident => "an identifier",
Self::Eof => "end-of-file",
})
}
}
/// A lexer that splits borrowed input text into [`Token`]s.
///
/// Cloning is cheap (the lexer only holds a string slice), which the
/// speculative-parsing helpers rely on to roll back after a failed attempt.
#[derive(Clone, Debug)]
pub(crate) struct Lexer<'de> {
/// The part of the input that has not been consumed yet.
data: &'de str,
}
impl<'de> Lexer<'de> {
pub fn new(data: &'de str) -> Self {
Self { data }
}
/// Drops any leading whitespace from the remaining input.
fn skip_whitespace(&mut self) {
    let trimmed = self.data.trim_start();
    self.data = trimmed;
}
/// Consumes the first `bytes` bytes of the remaining input.
///
/// # Panics
///
/// Panics (via `str::split_at`) if `bytes` is past the end of the input or
/// does not fall on a UTF-8 character boundary.
fn advance(&mut self, bytes: usize) {
    let (_consumed, rest) = self.data.split_at(bytes);
    self.data = rest;
}
/// Consumes and returns the next character, or returns `None` (consuming
/// nothing) when the input is exhausted.
fn take_char(&mut self) -> Option<char> {
    let c = self.data.chars().next()?;
    // The character sits at offset 0, so its encoded width is the amount to skip.
    self.advance(c.len_utf8());
    Some(c)
}
/// Returns the next character of the remaining input without consuming it,
/// or `None` when the input is exhausted.
///
/// Takes `&self`: peeking never modifies the lexer, so it must not demand
/// exclusive access. Callers holding `&mut self` can still call it unchanged.
fn peek_char(&self) -> Option<char> {
    self.data.chars().next()
}
/// Builds an "unexpected token" error for the character at the current
/// position, recording what was `expected` instead.
fn unexpected_token(&self, expected: impl Into<Expected>) -> LexerError {
    let offending = first_char(self.data);
    LexerError::unexpected_token(offending, expected)
}
/// Builds an error for running out of input while `expected` was required.
///
/// The error is created through `LexerError::unexpected_token` with the
/// empty slice located at the very end of the remaining input as the token.
fn unexpected_eof(&self, expected: impl Into<Expected>) -> LexerError {
    let end = self.data.len();
    let empty_tail = &self.data[end..];
    LexerError::unexpected_token(empty_tail, expected)
}
/// Runs `func` speculatively: the lexer only moves forward if `func` succeeds.
///
/// `func` operates on a clone of the lexer. On `Ok` the clone replaces
/// `self`; on `Err` the clone is discarded, leaving `self` untouched, and the
/// error is returned as-is.
fn try_parse<F, T>(&mut self, func: F) -> Result<T, LexerError>
where
    F: FnOnce(&mut Self) -> Result<T, LexerError>,
{
    let mut scratch = self.clone();
    let value = func(&mut scratch)?;
    // Success: commit the progress made on the scratch copy.
    *self = scratch;
    Ok(value)
}
/// Runs `func` speculatively and wraps whatever text it consumed in a [`Token`].
///
/// `func` decides the token's kind; the token's `value` is the slice of input
/// between the position before the call and the position after it. On error
/// the lexer is left where it was (see `try_parse`).
///
/// # Panics
///
/// Panics if, after `func` succeeds, the remaining input no longer starts
/// inside the text that was remaining before the call.
fn parse_consumed<F>(&mut self, func: F) -> Result<Token<'de>, LexerError>
where
    F: FnOnce(&mut Self) -> Result<TokenKind, LexerError>,
{
    let copy = self.data;
    let kind = self.try_parse(func)?;
    // Measure how far the lexer moved by comparing start addresses; the
    // assertion guards the subtraction and the slice below.
    let start = copy.as_ptr();
    let end = copy.as_ptr().wrapping_add(copy.len());
    assert!((start..=end).contains(&self.data.as_ptr()));
    let offset = self.data.as_ptr() as usize - start as usize;
    Ok(Token {
        kind,
        // Was the mis-encoded `©[..offset]` (an entity-decoded `&copy`).
        value: &copy[..offset],
    })
}
/// Skips leading whitespace and lexes the next token.
///
/// Dispatches on the first character: quotes start string/character
/// literals, ASCII digits start numbers, `XID_Start` characters start
/// identifiers, `.` must begin `..`, and a fixed set of single characters are
/// punctuation. Exhausted input yields an `Eof` token.
///
/// # Errors
///
/// Returns a [`LexerError`] if the next character cannot start any token or
/// if the selected sub-lexer fails; the lexer position is then unchanged
/// apart from the skipped whitespace.
pub fn parse_token(&mut self) -> Result<Token<'de>, LexerError> {
    self.skip_whitespace();
    self.parse_consumed(|this| {
        let next = match this.peek_char() {
            Some(c) => c,
            // Nothing left: end of input is reported as a token of its own.
            None => return Ok(TokenKind::Eof),
        };
        match next {
            '"' => this.parse_string(),
            '\'' => this.parse_char(),
            '.' => this.parse_dotdot(),
            '{' | '}' | '[' | ']' | ':' | ',' | '(' | ')' | '+' | '-' => {
                // All of these are single-byte ASCII characters.
                this.advance(1);
                Ok(TokenKind::Punct)
            }
            c if c.is_ascii_digit() => this.parse_number(),
            c if unicode_ident::is_xid_start(c) => this.parse_ident(),
            _ => Err(this.unexpected_token("a valid token")),
        }
    })
}
/// Lexes a double-quoted string literal, including both quotes.
///
/// A backslash escapes the single character that follows it, so `\"` does not
/// terminate the literal. Escape sequences are only skipped here, not
/// validated or decoded.
///
/// # Errors
///
/// Fails if the input does not start with `"`, if it ends right after a
/// backslash, or if no closing quote is found.
fn parse_string(&mut self) -> Result<TokenKind, LexerError> {
    self.data = match self.data.strip_prefix('"') {
        Some(rest) => rest,
        None => return Err(self.unexpected_token(TokenKind::String)),
    };
    while let Some(idx) = self.data.find(['"', '\\']) {
        let byte = self.data.as_bytes()[idx];
        self.advance(idx);
        if byte != b'\\' {
            // Found the closing quote; it is consumed below.
            break;
        }
        // Skip the backslash together with the *whole* character it escapes.
        // Skipping a fixed two bytes would split a multi-byte character
        // (e.g. `\é`) and panic inside `advance`.
        match self.data[1..].chars().next() {
            Some(escaped) => self.advance(1 + escaped.len_utf8()),
            None => return Err(self.unexpected_eof("a string literal")),
        }
    }
    match self.data.as_bytes().first() {
        Some(b'"') => {
            self.advance(1);
            Ok(TokenKind::String)
        }
        _ => Err(LexerError::unexpected_token(self.data, TokenKind::String)),
    }
}
/// Lexes a single-quoted character literal, including both quotes.
///
/// A backslash escapes the single character that follows it, so `\'` does not
/// terminate the literal. The content between the quotes is only skipped
/// here, not validated or decoded.
///
/// # Errors
///
/// Fails if the input does not start with `'`, if it ends right after a
/// backslash, or if no closing quote is found.
fn parse_char(&mut self) -> Result<TokenKind, LexerError> {
    self.data = match self.data.strip_prefix('\'') {
        Some(rest) => rest,
        None => return Err(self.unexpected_token(TokenKind::Char)),
    };
    while let Some(idx) = self.data.find(['\'', '\\']) {
        let byte = self.data.as_bytes()[idx];
        self.advance(idx);
        if byte != b'\\' {
            // Found the closing quote; it is consumed below.
            break;
        }
        // Skip the backslash together with the *whole* character it escapes.
        // Skipping a fixed two bytes would split a multi-byte character
        // (e.g. `\é`) and panic inside `advance`.
        match self.data[1..].chars().next() {
            Some(escaped) => self.advance(1 + escaped.len_utf8()),
            None => return Err(self.unexpected_eof("a character literal")),
        }
    }
    match self.data.as_bytes().first() {
        Some(b'\'') => {
            self.advance(1);
            Ok(TokenKind::Char)
        }
        _ => Err(LexerError::unexpected_token(self.data, TokenKind::Char)),
    }
}
/// Lexes an identifier: one `XID_Start` character followed by any number of
/// `XID_Continue` characters.
///
/// # Errors
///
/// Fails without consuming anything if the input is empty or its first
/// character is not `XID_Start`.
fn parse_ident(&mut self) -> Result<TokenKind, LexerError> {
    let data = self.data;
    let head = match data.chars().next() {
        None => return Err(self.unexpected_eof(TokenKind::Ident)),
        Some(c) if !unicode_ident::is_xid_start(c) => {
            return Err(self.unexpected_token(TokenKind::Ident));
        }
        Some(c) => c.len_utf8(),
    };
    // The identifier ends at the first character after the head that cannot
    // continue it, or at the end of the input.
    let tail = &data[head..];
    let end = match tail.find(|c: char| !unicode_ident::is_xid_continue(c)) {
        Some(pos) => head + pos,
        None => data.len(),
    };
    self.advance(end);
    Ok(TokenKind::Ident)
}
/// Lexes an integer or floating-point literal.
///
/// Accepted shapes:
/// - `0x`/`0o`/`0b` (either case) followed by at least one digit: `Integer`;
/// - a run of decimal digits: `Integer`;
/// - decimal digits followed by a fraction (`.` plus at least one digit)
///   and/or an exponent (`e`/`E`, optional sign, at least one digit): `Float`.
///
/// # Errors
///
/// Fails if the input does not start with an ASCII digit, or if a radix
/// prefix, `.`, or exponent marker is not followed by a required digit.
fn parse_number(&mut self) -> Result<TokenKind, LexerError> {
    // Validate the leading character *before* consuming it, so that the error
    // refers to the offending character rather than the one after it.
    match self.peek_char() {
        Some('0'..='9') => (),
        Some(_) => return Err(self.unexpected_token("a number")),
        None => return Err(self.unexpected_eof("a number")),
    }
    if self.take_char() == Some('0')
        && matches!(self.peek_char(), Some('x' | 'X' | 'o' | 'O' | 'b' | 'B'))
    {
        self.advance(1);
        // NOTE(review): every radix prefix accepts hex digits here; presumably
        // digits are range-checked when the token is converted — confirm.
        self.parse_once(TokenKind::Integer, |c| c.is_ascii_hexdigit())?;
        self.parse_repeated(|c| c.is_ascii_hexdigit());
        return Ok(TokenKind::Integer);
    }
    self.parse_repeated(|c| c.is_ascii_digit());
    // Without a fraction or exponent the literal is a plain integer.
    if !matches!(self.peek_char(), Some('.' | 'e' | 'E')) {
        return Ok(TokenKind::Integer);
    }
    if matches!(self.peek_char(), Some('.')) {
        self.advance(1);
        self.parse_once(TokenKind::Float, |c| c.is_ascii_digit())?;
        self.parse_repeated(|c| c.is_ascii_digit());
    }
    if matches!(self.peek_char(), Some('e' | 'E')) {
        self.advance(1);
        if matches!(self.peek_char(), Some('+' | '-')) {
            self.advance(1);
        }
        self.parse_once(TokenKind::Float, |c| c.is_ascii_digit())?;
        self.parse_repeated(|c| c.is_ascii_digit());
    }
    Ok(TokenKind::Float)
}
/// Lexes the two-character `..` punctuation token.
///
/// # Errors
///
/// Fails if either of the next two characters is not `.`.
fn parse_dotdot(&mut self) -> Result<TokenKind, LexerError> {
    for _ in 0..2 {
        self.parse_once("..", |c| c == '.')?;
    }
    Ok(TokenKind::Punct)
}
/// Consumes exactly one character if it satisfies `pred`.
///
/// # Errors
///
/// Returns an "unexpected token" error for the current character if `pred`
/// rejects it, or an "unexpected end of file" error if no input remains. In
/// both cases nothing is consumed and `expected` describes what was wanted.
fn parse_once<F>(&mut self, expected: impl Into<Expected>, pred: F) -> Result<(), LexerError>
where
    F: FnOnce(char) -> bool,
{
    let c = match self.data.chars().next() {
        Some(c) => c,
        None => return Err(LexerError::unexpected_eof(expected)),
    };
    if !pred(c) {
        return Err(self.unexpected_token(expected));
    }
    self.advance(c.len_utf8());
    Ok(())
}
/// Consumes the longest (possibly empty) prefix whose characters all satisfy
/// `pred`.
fn parse_repeated<F>(&mut self, mut pred: F)
where
    F: FnMut(char) -> bool,
{
    // Stop at the first rejected character, or take everything that is left.
    let stop = match self.data.find(|c: char| !pred(c)) {
        Some(pos) => pos,
        None => self.data.len(),
    };
    self.advance(stop);
}
}
/// Returns the sub-slice of `s` holding only its first character, or `s`
/// itself when it is empty.
fn first_char(s: &str) -> &str {
    // The first character ends where the second one begins; if there is no
    // second character, `s` is already empty or a single character.
    match s.char_indices().nth(1) {
        Some((end, _)) => &s[..end],
        None => s,
    }
}