ucifer 0.3.0 - Docs.rs

/* Copyright © 2025 CZ.NIC z.s.p.o. (http://www.nic.cz/)
 *
 * This file is part of the ucifer library
 *
 * Ucifer is free software: you can redistribute it and/or modify it under
 * the terms of the GNU General Public License as published by the Free
 * Software Foundation, either version 3 of the License, or (at your option)
 * any later version.
 *
 * Ucifer is distributed in the hope that it will be useful, but WITHOUT ANY
 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
 * details.
 *
 * For more information, see /LICENSE.txt
 */

//! Contains token and lexing definition for the file format
//!
//! Currently based on [`logos`].
//!
//! Token is an implementation detail of the parser and should not be exposed as
//! public API.

use {crate::CowStr, alloc::borrow::Cow, descape::InvalidEscape, logos::Logos};

/// [`logos::Lexer`] specialised to produce UCI [`Token`]s from a `&'a str`
/// input
pub type Lexer<'a> = logos::Lexer<'a, Token<'a>>;

/// [`core::result::Result`] whose error type defaults to this module's
/// [`Error`]
type Result<T, E = Error> = core::result::Result<T, E>;

/// A unit of source code
///
/// The lexer is derived by [`logos`]. Runs of spaces, tabs and form feeds are
/// skipped, and so are comments introduced by `#` or `;`. The line break that
/// follows a comment is not part of the comment and is still emitted as
/// [`Token::LineBreak`].
#[derive(Clone, Debug, PartialEq, Eq, Logos)]
#[logos(error = Error)]
#[logos(skip r"[ \t\f]+|[#;].*")]
pub enum Token<'a> {
	/// Identifier-like string
	///
	/// Consists of one or more ASCII letters, digits, `_` or `-`.
	///
	/// As non-quoted strings can't contain escapes, this is always just a
	/// borrow.
	#[regex("[0-9A-Za-z_-]+")]
	UnquotedString(&'a str),

	/// String quoted in either `'` or `"`
	///
	/// Holds the content without the surrounding quotation marks. Single
	/// quoted strings are taken verbatim; double quoted strings may contain
	/// backslash escapes, which are resolved by the lexer callback.
	#[regex(r"'[^']*'", single_quoted)]
	#[regex(r#""([^"\\]|\\.)*""#, double_quoted)]
	QuotedString(CowStr<'a>),

	/// Newline, which acts as a directive separator
	#[token("\n")]
	LineBreak,
}

impl<'a> Token<'a> {
	/// Get internaly [`Cow`] string, no matter if quoted or unquoted.
	pub fn into_cow_str(self) -> CowStr<'a> {
		match self {
			Self::UnquotedString(s) => Cow::Borrowed(s),
			Self::QuotedString(cow) => cow,
			Self::LineBreak => Cow::Borrowed("\n"),
		}
	}
}

/// Describes lexing error
///
/// Registered as the error type of the [`Token`] lexer.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum Error {
	/// Invalid string escape inside a double quoted string
	Escape(InvalidEscape),
	/// We were inside a pattern but didn't get finished
	///
	/// Reported when the surrounding characters (quotation marks) could not
	/// be stripped from the matched lexeme.
	UnexpectedEndOfPattern,
	/// Lexer failed to lex token, may be end of input
	///
	/// This is the [`Default`] variant of the error.
	#[default]
	InvalidToken,
}

/// Lexer callback for single quoted strings.
///
/// No escaping is supported here: the quotation marks are cut off and the
/// content in between is borrowed verbatim from the input.
fn single_quoted<'a>(lexer: &Lexer<'a>) -> Result<CowStr<'a>> {
	let inner = get_without_surrounding_chars(lexer)?;
	Ok(Cow::Borrowed(inner))
}

/// Double quoted strings support escapes. This returns in-[`Some`] wrapped
/// [`Cow::Owned`] if escaping was required, otherwise keep [`Cow::Borrowed`]
/// as-is.
fn double_quoted<'a>(lexer: &Lexer<'a>) -> Result<CowStr<'a>> {
	let slice = get_without_surrounding_chars(lexer)?;
	descape::UnescapeExt::to_unescaped(slice).map_err(Error::Escape)
}

/// Remove surrounding characters from current lexeme, usually a quotation mark.
///
/// Returns the current lexeme without its first and last byte.
///
/// # Errors
///
/// Returns [`Error::UnexpectedEndOfPattern`] when the lexeme is too short to
/// have two surrounding characters, or when cutting one byte off either end
/// would not land on a character boundary.
fn get_without_surrounding_chars<'a>(lexer: &Lexer<'a>) -> Result<&'a str> {
	let slice = lexer.slice();
	slice
		.len()
		// An empty lexeme must yield an error rather than underflow `len - 1`.
		.checked_sub(1)
		.and_then(|end| slice.get(1..end))
		.ok_or(Error::UnexpectedEndOfPattern)
}

#[cfg(test)]
mod tests {
	use super::*;

	impl Token<'_> {
		/// Borrow the text of the token, no matter if quoted or unquoted.
		fn as_str(&self) -> &'_ str {
			match self {
				Token::UnquotedString(s) => s,
				Token::QuotedString(cow) => cow,
				Token::LineBreak => "\n",
			}
		}
	}

	#[test]
	fn lex_unquoted() {
		assert_eq!(lex_one_token("simple_ident").as_str(), "simple_ident");
		assert_eq!(lex_one_token("133t").as_str(), "133t");
	}

	#[test]
	fn lex_quoted() {
		assert_eq!(lex_one_token("'single quoted'").as_str(), "single quoted");
		assert_eq!(
			lex_one_token(r#""doubly quoted""#).as_str(),
			"doubly quoted"
		);
	}

	#[test]
	fn lex_quoted_escapes() {
		// No escape sequence: the other kind of quote is plain content.
		assert_eq!(
			lex_one_token(r#""string's cake""#).as_str(),
			"string's cake"
		);
		// An escaped quote must not terminate the string and gets unescaped.
		assert_eq!(
			lex_one_token(r#""say \"hi\"""#).as_str(),
			r#"say "hi""#
		);
		// Control character escape.
		assert_eq!(lex_one_token(r#""tab\tsep""#).as_str(), "tab\tsep");
	}

	#[test]
	fn lex_single_quoted_keeps_backslashes() {
		// Single quoted strings do not support any kind of escaping.
		assert_eq!(lex_one_token(r"'tab\tsep'").as_str(), r"tab\tsep");
	}

	#[test]
	fn lex_invalid_escape() {
		// `\x` must be followed by hexadecimal digits.
		let mut lexer = Token::lexer(r#""bad \xJJ""#);

		assert!(matches!(lexer.next(), Some(Err(Error::Escape(_)))));
	}

	#[test]
	fn comments() {
		let input = "#comment\n#comment";
		let mut lexer = Token::lexer(input);

		assert_eq!(lexer.next(), Some(Ok(Token::LineBreak)));
		assert_eq!(lexer.next(), None);
	}

	/// Helper function, get **exactly** one single valid token or panics.
	#[track_caller]
	fn lex_one_token(input: &str) -> Token<'_> {
		let mut lexer = Token::lexer(input);
		let token = lexer
			.next()
			.expect("expected at least one token")
			.expect("expected valid parse");

		assert_eq!(lexer.next(), None, "required at most one token");
		token
	}
}