use crate::error::{Error, ErrorKind, Result, Span};
use crate::lexer::cursor::Cursor;
/// The kind of a lexed TOML token, carrying the decoded payload for
/// value-bearing tokens.
#[derive(Debug, Clone, PartialEq)]
pub enum TomlTokenKind {
    /// A single `[`.
    LeftBracket,
    /// A single `]`.
    RightBracket,
    /// Two adjacent `[` bytes, lexed as one token.
    DoubleLeftBracket,
    /// Two adjacent `]` bytes, lexed as one token.
    DoubleRightBracket,
    /// `{`
    LeftBrace,
    /// `}`
    RightBrace,
    /// `=`
    Equals,
    /// `,`
    Comma,
    /// `.`
    Dot,
    /// A line feed (`\n`).
    Newline,
    /// A run of ASCII letters, digits, `_` and `-` that is not `true`/`false`.
    BareKey(String),
    /// The contents of a quoted string, without its delimiters.
    String(String),
    /// A signed 64-bit integer value.
    Integer(i64),
    /// A 64-bit floating point value.
    Float(f64),
    /// The literal `true` or `false`.
    Bool(bool),
    /// The raw source text of something that looks like a date and/or time.
    Datetime(String),
    /// End of input.
    Eof,
}
/// One lexed token: what it is, and the source span it was read from.
#[derive(Debug, Clone, PartialEq)]
pub struct TomlToken {
    /// The token's kind, including any decoded payload.
    pub kind: TomlTokenKind,
    /// The span of input this token covers.
    pub span: Span,
}
impl TomlToken {
pub const fn new(kind: TomlTokenKind, span: Span) -> Self {
Self { kind, span }
}
}
/// A pull-based lexer over a borrowed byte slice of TOML source.
#[derive(Debug, Clone)]
pub struct TomlLexer<'a> {
    /// Tracks the read position within the input.
    cursor: Cursor<'a>,
}
impl<'a> TomlLexer<'a> {
pub const fn new(input: &'a [u8]) -> Self {
Self {
cursor: Cursor::new(input),
}
}
pub fn next_token(&mut self) -> Result<TomlToken> {
self.skip_space();
let start = self.cursor.position();
let kind = match self.cursor.current() {
None => TomlTokenKind::Eof,
Some(b'\n') => {
self.cursor.advance();
TomlTokenKind::Newline
}
Some(b'#') => {
self.skip_comment();
return self.next_token();
}
Some(b'[') => {
if self.cursor.peek(1) == Some(b'[') {
self.cursor.advance_by(2);
TomlTokenKind::DoubleLeftBracket
} else {
self.cursor.advance();
TomlTokenKind::LeftBracket
}
}
Some(b']') => {
if self.cursor.peek(1) == Some(b']') {
self.cursor.advance_by(2);
TomlTokenKind::DoubleRightBracket
} else {
self.cursor.advance();
TomlTokenKind::RightBracket
}
}
Some(b'{') => {
self.cursor.advance();
TomlTokenKind::LeftBrace
}
Some(b'}') => {
self.cursor.advance();
TomlTokenKind::RightBrace
}
Some(b'=') => {
self.cursor.advance();
TomlTokenKind::Equals
}
Some(b',') => {
self.cursor.advance();
TomlTokenKind::Comma
}
Some(b'.') => {
self.cursor.advance();
TomlTokenKind::Dot
}
Some(b'"') => self.lex_basic_string()?,
Some(b'\'') => self.lex_literal_string()?,
Some(b'-') => {
if matches!(
self.cursor.peek(1),
Some(b'A'..=b'Z' | b'a'..=b'z' | b'_' | b'-')
) {
self.lex_bare_key_or_bool()?
} else {
self.lex_number_or_datetime()?
}
}
Some(b'+' | b'0'..=b'9') => self.lex_number_or_datetime()?,
Some(b'A'..=b'Z' | b'a'..=b'z' | b'_') => self.lex_bare_key_or_bool()?,
Some(_b) => {
return Err(Error::at(
ErrorKind::InvalidToken,
start.offset,
start.line,
start.col,
));
}
};
let end = self.cursor.position();
Ok(TomlToken::new(kind, Span::new(start, end)))
}
fn skip_space(&mut self) {
while let Some(b) = self.cursor.current() {
if matches!(b, b' ' | b'\t' | b'\r') {
self.cursor.advance();
} else {
break;
}
}
}
fn skip_comment(&mut self) {
while let Some(b) = self.cursor.current() {
self.cursor.advance();
if b == b'\n' {
break;
}
}
}
fn lex_basic_string(&mut self) -> Result<TomlTokenKind> {
if self.cursor.peek_bytes(3) == Some(b"\"\"\"") {
return self.lex_multiline_basic_string();
}
self.cursor.advance();
let mut result = String::new();
loop {
match self.cursor.current() {
None => {
return Err(Error::at(
ErrorKind::UnterminatedString,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
));
}
Some(b'"') => {
self.cursor.advance();
break;
}
Some(b'\'') => {
self.cursor.advance();
result.push(self.lex_basic_escape()?);
}
Some(b'\n') => {
return Err(Error::at(
ErrorKind::UnterminatedString,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
));
}
Some(b) => {
result.push(char::from(b));
self.cursor.advance();
}
}
}
Ok(TomlTokenKind::String(result))
}
fn lex_multiline_basic_string(&mut self) -> Result<TomlTokenKind> {
self.cursor.advance_by(3);
let mut result = String::new();
loop {
match self.cursor.current() {
None => {
return Err(Error::at(
ErrorKind::UnterminatedString,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
));
}
Some(b'"') => {
if self.cursor.peek_bytes(3) == Some(b"\"\"\"") {
self.cursor.advance_by(3);
break;
}
result.push('"');
self.cursor.advance();
}
Some(b'\\') => {
self.cursor.advance();
result.push(self.lex_basic_escape()?);
}
Some(b) => {
result.push(char::from(b));
self.cursor.advance();
}
}
}
Ok(TomlTokenKind::String(result))
}
fn lex_basic_escape(&mut self) -> Result<char> {
match self.cursor.current() {
Some(b'"') => {
self.cursor.advance();
Ok('"')
}
Some(b'\\') => {
self.cursor.advance();
Ok('\\')
}
Some(b'n') => {
self.cursor.advance();
Ok('\n')
}
Some(b'r') => {
self.cursor.advance();
Ok('\r')
}
Some(b't') => {
self.cursor.advance();
Ok('\t')
}
Some(b'b') => {
self.cursor.advance();
Ok('\x08')
}
Some(b'f') => {
self.cursor.advance();
Ok('\x0C')
}
Some(b'u') => {
self.cursor.advance();
self.lex_unicode_escape(4)
}
Some(b'U') => {
self.cursor.advance();
self.lex_unicode_escape(8)
}
_ => Err(Error::at(
ErrorKind::InvalidEscapeSequence,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
)),
}
}
fn lex_unicode_escape(&mut self, digits: usize) -> Result<char> {
let mut value: u32 = 0;
for _ in 0..digits {
let b = match self.cursor.current() {
Some(b) => b,
None => {
return Err(Error::at(
ErrorKind::InvalidUnicodeEscape,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
));
}
};
let digit = match b {
b'0'..=b'9' => u32::from(b - b'0'),
b'a'..=b'f' => u32::from(b - b'a') + 10,
b'A'..=b'F' => u32::from(b - b'A') + 10,
_ => {
return Err(Error::at(
ErrorKind::InvalidUnicodeEscape,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
));
}
};
value = value.saturating_mul(16).saturating_add(digit);
self.cursor.advance();
}
match char::from_u32(value) {
Some(ch) => Ok(ch),
None => Err(Error::at(
ErrorKind::InvalidUnicodeEscape,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
)),
}
}
fn lex_literal_string(&mut self) -> Result<TomlTokenKind> {
if self.cursor.peek_bytes(3) == Some(b"'''") {
return self.lex_multiline_literal_string();
}
self.cursor.advance();
let mut result = String::new();
loop {
match self.cursor.current() {
None => {
return Err(Error::at(
ErrorKind::UnterminatedString,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
));
}
Some(b'\'') => {
self.cursor.advance();
break;
}
Some(b'\n') => {
return Err(Error::at(
ErrorKind::UnterminatedString,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
));
}
Some(b) => {
result.push(char::from(b));
self.cursor.advance();
}
}
}
Ok(TomlTokenKind::String(result))
}
fn lex_multiline_literal_string(&mut self) -> Result<TomlTokenKind> {
self.cursor.advance_by(3);
let mut result = String::new();
loop {
match self.cursor.current() {
None => {
return Err(Error::at(
ErrorKind::UnterminatedString,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
));
}
Some(b'\'') => {
if self.cursor.peek_bytes(3) == Some(b"'''") {
self.cursor.advance_by(3);
break;
}
result.push('\'');
self.cursor.advance();
}
Some(b) => {
result.push(char::from(b));
self.cursor.advance();
}
}
}
Ok(TomlTokenKind::String(result))
}
fn lex_bare_key_or_bool(&mut self) -> Result<TomlTokenKind> {
let start = self.cursor.pos();
while let Some(b) = self.cursor.current() {
if matches!(b, b'A'..=b'Z' | b'a'..=b'z' | b'0'..=b'9' | b'_' | b'-') {
self.cursor.advance();
} else {
break;
}
}
let raw = self.cursor.slice_from(start);
let text = std::str::from_utf8(raw).map_err(|_| {
Error::at(
ErrorKind::InvalidToken,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
)
})?;
match text {
"true" => Ok(TomlTokenKind::Bool(true)),
"false" => Ok(TomlTokenKind::Bool(false)),
_ => Ok(TomlTokenKind::BareKey(text.to_string())),
}
}
fn lex_number_or_datetime(&mut self) -> Result<TomlTokenKind> {
let start = self.cursor.pos();
if self.cursor.current() == Some(b'+') || self.cursor.current() == Some(b'-') {
self.cursor.advance();
}
while let Some(b) = self.cursor.current() {
if matches!(
b,
b'0'..=b'9' | b'_' | b'.' | b'e' | b'E' | b':' | b'T' | b'Z' | b'z' | b'-' | b'+'
) {
self.cursor.advance();
} else {
break;
}
}
let raw = self.cursor.slice_from(start);
let text = std::str::from_utf8(raw).map_err(|_| {
Error::at(
ErrorKind::InvalidToken,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
)
})?;
if is_datetime_like(text) {
return Ok(TomlTokenKind::Datetime(text.to_string()));
}
let normalized = text.replace('_', "");
if let Some(float) = parse_special_float(&normalized) {
return Ok(TomlTokenKind::Float(float));
}
if normalized.contains(['.', 'e', 'E']) {
let value = normalized.parse::<f64>().map_err(|_| {
Error::at(
ErrorKind::InvalidNumber,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
)
})?;
return Ok(TomlTokenKind::Float(value));
}
let (sign, digits) = match normalized.strip_prefix('-') {
Some(rest) => (-1_i64, rest),
None => match normalized.strip_prefix('+') {
Some(rest) => (1_i64, rest),
None => (1_i64, normalized.as_str()),
},
};
let (radix, digits) = if let Some(rest) = digits.strip_prefix("0x") {
(16_u32, rest)
} else if let Some(rest) = digits.strip_prefix("0o") {
(8_u32, rest)
} else if let Some(rest) = digits.strip_prefix("0b") {
(2_u32, rest)
} else {
(10_u32, digits)
};
let unsigned = i64::from_str_radix(digits, radix).map_err(|_| {
Error::at(
ErrorKind::InvalidNumber,
self.cursor.position().offset,
self.cursor.position().line,
self.cursor.position().col,
)
})?;
let value = unsigned.saturating_mul(sign);
Ok(TomlTokenKind::Integer(value))
}
}
/// Recognises the special float spellings `inf` and `nan`, each optionally
/// preceded by a single `+` or `-`.
///
/// Matching is exact and case-sensitive. `nan` maps to `f64::NAN` whatever its
/// sign; `inf` maps to negative infinity only when the text starts with `-`.
/// Any other text yields `None`.
fn parse_special_float(text: &str) -> Option<f64> {
    let unsigned = text.strip_prefix(['+', '-']).unwrap_or(text);
    if unsigned == "nan" {
        Some(f64::NAN)
    } else if unsigned == "inf" {
        let value = if text.starts_with('-') {
            f64::NEG_INFINITY
        } else {
            f64::INFINITY
        };
        Some(value)
    } else {
        None
    }
}
/// Heuristically decides whether scanned numeric text is a date and/or time
/// rather than a number.
///
/// The text counts as date/time-like when it contains `T` or `:`, or ends in
/// `Z`/`z`. Failing that, it must consist solely of ASCII digits and dashes,
/// with at least two dashes and at least eight bytes in total (as in
/// `1979-05-27`). No calendar validation is performed.
fn is_datetime_like(text: &str) -> bool {
    let bytes = text.as_bytes();
    let has_time_marker =
        text.contains(['T', ':']) || matches!(bytes.last(), Some(b'Z' | b'z'));
    if has_time_marker {
        return true;
    }
    let only_digits_and_dashes = bytes.iter().all(|&b| b == b'-' || b.is_ascii_digit());
    if !only_digits_and_dashes {
        return false;
    }
    let dashes = bytes.iter().filter(|&&b| b == b'-').count();
    dashes >= 2 && bytes.len() >= 8
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::error::Result;

    /// Pulls one token from `lexer` and fails with a descriptive error unless
    /// its kind equals `expected`.
    fn matches_token(lexer: &mut TomlLexer<'_>, expected: TomlTokenKind) -> Result<()> {
        let TomlToken { kind, span } = lexer.next_token()?;
        if kind == expected {
            Ok(())
        } else {
            Err(Error::with_message(
                ErrorKind::InvalidToken,
                span,
                format!("expected {expected:?}, got {actual:?}", actual = kind),
            ))
        }
    }

    /// Lexes `input` and checks that it starts with exactly the `expected`
    /// kinds, in order, stopping at the first mismatch.
    fn expect_kinds(input: &[u8], expected: Vec<TomlTokenKind>) -> Result<()> {
        let mut lexer = TomlLexer::new(input);
        expected
            .into_iter()
            .try_for_each(|kind| matches_token(&mut lexer, kind))
    }

    /// Shorthand for a `BareKey` token kind.
    fn key(name: &str) -> TomlTokenKind {
        TomlTokenKind::BareKey(name.to_string())
    }

    /// Shorthand for a `String` token kind.
    fn string(value: &str) -> TomlTokenKind {
        TomlTokenKind::String(value.to_string())
    }

    #[test]
    fn test_simple_tokens() -> Result<()> {
        expect_kinds(
            b"[table]\nkey = 1\n",
            vec![
                TomlTokenKind::LeftBracket,
                key("table"),
                TomlTokenKind::RightBracket,
                TomlTokenKind::Newline,
                key("key"),
                TomlTokenKind::Equals,
                TomlTokenKind::Integer(1),
                TomlTokenKind::Newline,
            ],
        )
    }

    #[test]
    fn test_string_tokens() -> Result<()> {
        expect_kinds(
            b"title = \"hello\"\nname = 'world'\n",
            vec![
                key("title"),
                TomlTokenKind::Equals,
                string("hello"),
                TomlTokenKind::Newline,
                key("name"),
                TomlTokenKind::Equals,
                string("world"),
                TomlTokenKind::Newline,
            ],
        )
    }

    #[test]
    fn test_numbers_and_bool() -> Result<()> {
        expect_kinds(
            b"flag = true\nint = -42\nfloat = 3.5\n",
            vec![
                key("flag"),
                TomlTokenKind::Equals,
                TomlTokenKind::Bool(true),
                TomlTokenKind::Newline,
                key("int"),
                TomlTokenKind::Equals,
                TomlTokenKind::Integer(-42),
                TomlTokenKind::Newline,
                key("float"),
                TomlTokenKind::Equals,
                TomlTokenKind::Float(3.5),
                TomlTokenKind::Newline,
            ],
        )
    }

    #[test]
    fn test_array_table_tokens() -> Result<()> {
        expect_kinds(
            b"[[products]]\nname = \"book\"\n",
            vec![
                TomlTokenKind::DoubleLeftBracket,
                key("products"),
                TomlTokenKind::DoubleRightBracket,
                TomlTokenKind::Newline,
                key("name"),
                TomlTokenKind::Equals,
                string("book"),
                TomlTokenKind::Newline,
            ],
        )
    }

    #[test]
    fn test_datetime_token() -> Result<()> {
        expect_kinds(
            b"date = 1979-05-27T07:32:00Z\n",
            vec![
                key("date"),
                TomlTokenKind::Equals,
                TomlTokenKind::Datetime("1979-05-27T07:32:00Z".to_string()),
                TomlTokenKind::Newline,
            ],
        )
    }
}