use std::{iter::Peekable, str::Chars};
#[cfg(test)]
mod test;
pub mod token;
pub use token::*;
/// Character-level lexer over a borrowed source string.
///
/// The lexer keeps a one-character cursor (`current_char`) plus a
/// one-character lookahead (`peek_char`) on top of the underlying iterator.
pub struct Lexer<'src> {
/// Remaining, not-yet-consumed characters of the source text.
source: Peekable<Chars<'src>>,
/// Character currently under the cursor; `None` once the input is exhausted.
current_char: Option<char>,
/// Character immediately after `current_char`, copied from `source.peek()`;
/// `None` when there is no further character.
peek_char: Option<char>,
}
impl<'src> Lexer<'src> {
    /// Creates a lexer positioned on the first character of `source_str`.
    ///
    /// The cursor and the lookahead are primed immediately: `current_char`
    /// holds the first character and `peek_char` the second (each `None` if
    /// the input is too short).
    pub fn new(source_str: &'src str) -> Self {
        let mut chars = source_str.chars().peekable();
        // Order matters: pull the first character before peeking at the second.
        let first = chars.next();
        let lookahead = chars.peek().copied();
        Self {
            source: chars,
            current_char: first,
            peek_char: lookahead,
        }
    }
}
impl Lexer<'_> {
    /// Advances the cursor by one character and refreshes the lookahead.
    fn read_char(&mut self) {
        self.current_char = self.source.next();
        self.peek_char = self.source.peek().copied();
    }

    /// Skips every ASCII whitespace character under the cursor.
    ///
    /// NOTE(review): ASCII whitespace includes `'\n'`, so the `NewLine` arm in
    /// `parse_token` looks unreachable through `next_token` — confirm intended.
    fn skip_whitespace(&mut self) {
        while matches!(self.current_char, Some(ch) if ch.is_ascii_whitespace()) {
            self.read_char();
        }
    }

    /// Skips a single `// line` or `/* block */` comment starting at the
    /// cursor, together with the whitespace around it.
    ///
    /// An unterminated comment (no trailing newline, or no closing `*/`)
    /// simply ends at end of input instead of looping forever.
    fn skip_comments(&mut self) {
        self.skip_whitespace();
        match (self.current_char, self.peek_char) {
            (Some('/'), Some('/')) => {
                // Consume up to and including the newline, or stop at EOF.
                while let Some(ch) = self.current_char {
                    self.read_char();
                    if ch == '\n' {
                        break;
                    }
                }
            }
            (Some('/'), Some('*')) => {
                // Step past the opening `/*` first so that its `*` cannot be
                // mistaken for the start of the closing `*/` (as in `/*/`).
                self.read_char();
                self.read_char();
                loop {
                    match (self.current_char, self.peek_char) {
                        (Some('*'), Some('/')) => {
                            self.read_char();
                            self.read_char();
                            break;
                        }
                        // Unterminated block comment: stop at end of input.
                        (None, _) => break,
                        _ => self.read_char(),
                    }
                }
            }
            _ => {}
        }
        self.skip_whitespace();
    }

    /// Lexes a numeric literal starting at the cursor.
    ///
    /// Accepts an optional `0b` / `0o` / `0x` prefix followed by any run of
    /// ASCII hex digits and `_`; digits are not validated against the radix
    /// here. A single `.` turns an unprefixed literal into a float, except
    /// when it is immediately followed by another `.`: that is left in the
    /// input so `1..2` lexes as `Int`, `TwoDots`, `Int`.
    ///
    /// Returns `Token::Float` if a decimal point was consumed, otherwise
    /// `Token::Int`; the literal text is kept verbatim (prefix included).
    fn number_token(&mut self) -> Token {
        let mut result = String::new();
        let mut is_float = false;
        let mut has_radix_prefix = false;

        if self.current_char == Some('0') {
            if let Some(prefix @ ('b' | 'o' | 'x')) = self.peek_char {
                has_radix_prefix = true;
                result.push('0');
                result.push(prefix);
                self.read_char();
                self.read_char();
            }
        }

        while let Some(ch) = self.current_char {
            if ch == '.' {
                // Prefixed literals have no fractional part, a literal has at
                // most one decimal point, and `..` belongs to the range token.
                if has_radix_prefix || is_float || self.peek_char == Some('.') {
                    break;
                }
                is_float = true;
            } else if !(ch.is_ascii_hexdigit() || ch == '_') {
                break;
            }
            result.push(ch);
            self.read_char();
        }

        let lit = Literal::from(result);
        if is_float {
            Token::Float(lit)
        } else {
            Token::Int(lit)
        }
    }

    /// Lexes a run of alphanumeric characters and `_`, then hands the text to
    /// `Token::keyword` to build the resulting token.
    fn ident_token(&mut self) -> Token {
        let mut literal = String::new();
        while let Some(ch) = self.current_char {
            if !(ch.is_alphanumeric() || ch == '_') {
                break;
            }
            literal.push(ch);
            self.read_char();
        }
        Token::keyword(&literal)
    }

    /// Lexes a double-quoted string literal; the cursor must be on the
    /// opening `"`.
    ///
    /// Escape sequences are decoded through `read_escape`; an escape that
    /// cannot be decoded contributes nothing to the result. If the input ends
    /// before a closing quote, the text collected so far is returned.
    fn string_token(&mut self) -> Token {
        let mut result = String::new();
        self.read_char(); // step past the opening quote
        while let Some(ch) = self.current_char {
            match ch {
                '"' => {
                    self.read_char();
                    break;
                }
                '\\' => {
                    if let Some(escaped) = self.read_escape() {
                        result.push(escaped);
                    }
                }
                _ => result.push(ch),
            }
            // Step past the plain character or the last character of the escape.
            self.read_char();
        }
        Token::String(Literal::from(result))
    }

    /// Lexes a single-quoted character literal; the cursor must be on the
    /// opening `'`.
    ///
    /// Returns `Token::Char` only when exactly one (possibly escaped)
    /// character is followed by a closing `'`. Otherwise returns
    /// `Token::Illegal` carrying whatever was read, leaving the offending
    /// character in the input. This covers a missing closing quote, an
    /// undecodable escape and end of input.
    fn char_token(&mut self) -> Token {
        let mut result = String::new();
        self.read_char(); // step past the opening quote

        let well_formed = match self.current_char {
            Some('\\') => {
                let escaped = self.read_escape();
                // `read_escape` leaves the cursor on the escape's last character.
                self.read_char();
                match escaped {
                    Some(ch) => {
                        result.push(ch);
                        true
                    }
                    None => false,
                }
            }
            Some(ch) => {
                result.push(ch);
                self.read_char();
                true
            }
            None => false,
        };

        if !well_formed || self.current_char != Some('\'') {
            return Token::Illegal(Literal::from(result));
        }
        self.read_char(); // consume the closing quote
        Token::Char(Literal::from(result))
    }

    /// Decodes the escape sequence whose backslash is under the cursor.
    ///
    /// Supported forms: `\e`, `\n`, `\t`, `\r`, `\\`, `\"`, `\'`, `\xH` /
    /// `\xHH` (one or two hex digits) and `\u{H…}` (hex code point). Any other
    /// escaped character stands for itself.
    ///
    /// Returns `None` when the input ends after the backslash or the sequence
    /// is malformed. On return the cursor rests on the last character that
    /// belongs to the escape, so callers advance once to move past it.
    /// Characters that are not part of the escape (such as a closing quote)
    /// are never consumed.
    fn read_escape(&mut self) -> Option<char> {
        self.read_char();
        match self.current_char? {
            'e' => Some('\x1b'),
            'n' => Some('\n'),
            't' => Some('\t'),
            'r' => Some('\r'),
            '\\' => Some('\\'),
            '"' => Some('"'),
            '\'' => Some('\''),
            'x' => {
                let mut hex_str = String::with_capacity(2);
                // Look ahead before consuming so a short escape such as
                // `\x4"` does not swallow the closing delimiter.
                for _ in 0..2 {
                    match self.peek_char {
                        Some(c) if c.is_ascii_hexdigit() => {
                            hex_str.push(c);
                            self.read_char();
                        }
                        _ => break,
                    }
                }
                u8::from_str_radix(&hex_str, 16).ok().map(char::from)
            }
            'u' => {
                if self.peek_char != Some('{') {
                    return None;
                }
                self.read_char(); // cursor now on `{`
                let mut unicode = String::new();
                // Only hex digits may appear inside the braces; stopping at
                // the first other character keeps a missing `}` from eating
                // the rest of the input.
                while let Some(c) = self.peek_char {
                    if !c.is_ascii_hexdigit() {
                        break;
                    }
                    unicode.push(c);
                    self.read_char();
                }
                if self.peek_char != Some('}') {
                    return None;
                }
                self.read_char(); // cursor now on `}`
                u32::from_str_radix(&unicode, 16)
                    .ok()
                    .and_then(char::from_u32)
            }
            other => Some(other),
        }
    }

    /// Lexes an operator or punctuation token at the cursor.
    ///
    /// Two-character operators are recognised through the one-character
    /// lookahead. An unrecognised character is consumed and returned as
    /// `Token::Illegal`; end of input yields `Token::EOF`.
    fn parse_token(&mut self) -> Token {
        // `token!(A)` consumes one character and yields `Token::A`.
        // `token!(A ['x' => B, ...])` yields `Token::B` (consuming two
        // characters) when the lookahead is 'x', and falls back to `A`.
        macro_rules! token {
            ( $token_ty_default: ident [ $( $ch: literal => $token_ty: ident ),* ] ) => {
                match self.peek_char {
                    $(
                        Some($ch) => {
                            self.read_char();
                            self.read_char();
                            Token::$token_ty
                        }
                    )*
                    _ => token!($token_ty_default)
                }
            };
            ( $token_ty: ident ) => {{
                self.read_char();
                Token::$token_ty
            }};
        }

        let Some(ch) = self.current_char else {
            return Token::EOF;
        };
        match ch {
            '\n' => token!(NewLine),
            '=' => token!(Assign ['=' => Equal, '>' => FatArrow]),
            '!' => token!(Bang ['=' => NotEqual]),
            '|' => token!(Pipe ['|' => Or]),
            '&' => token!(Ampersant ['&' => And]),
            '^' => token!(Caret),
            '~' => token!(Tilde),
            '@' => token!(AtSign),
            '#' => token!(Hash),
            '?' => token!(QuestionMark),
            '+' => token!(Plus ['+' => Increase]),
            '-' => token!(Minus ['>' => ThinArrow, '-' => Decrease]),
            '/' => token!(Slash),
            '*' => token!(Asterisk),
            '%' => token!(Percent),
            '>' => token!(GreaterThan ['=' => GreatEqual, '>' => ShiftRight]),
            '<' => token!(LessThan ['=' => LessEqual, '<' => ShiftLeft]),
            ',' => token!(Comma),
            '.' => token!(Dot ['.' => TwoDots]),
            ';' => token!(SemiColon),
            ':' => token!(Colon [':' => Namespace]),
            '(' => token!(LParen),
            ')' => token!(RParen),
            '[' => token!(LBracket),
            ']' => token!(RBracket),
            '{' => token!(LBrace),
            '}' => token!(RBrace),
            c => {
                self.read_char();
                Token::Illegal(Literal::from(c.to_string()))
            }
        }
    }

    /// Returns the next token, skipping leading whitespace and comments.
    ///
    /// Returns `Token::EOF` once the input is exhausted.
    pub fn next_token(&mut self) -> Token {
        self.skip_whitespace();
        // Iterate rather than recurse so that a long run of consecutive
        // comments cannot grow the call stack. `skip_comments` also skips the
        // whitespace that follows each comment.
        while self.current_char == Some('/') && matches!(self.peek_char, Some('/' | '*')) {
            self.skip_comments();
        }
        match self.current_char {
            Some(ch) if ch.is_alphabetic() || ch == '_' => self.ident_token(),
            Some(ch) if ch.is_ascii_digit() => self.number_token(),
            Some('\'') => self.char_token(),
            Some('"') => self.string_token(),
            Some(_) => self.parse_token(),
            None => Token::EOF,
        }
    }
}
impl Iterator for Lexer<'_> {
    type Item = Token;

    /// Yields the result of `next_token`, mapping `Token::EOF` to `None`.
    fn next(&mut self) -> Option<Self::Item> {
        let token = self.next_token();
        if matches!(token, Token::EOF) {
            None
        } else {
            Some(token)
        }
    }
}