use std::fmt;
/// Byte-oriented cursor over a borrowed source string.
///
/// The same input is held twice: as bytes (`src`) for scanning and as `str`
/// (`input`) so that slices can be handed back as `&'a str`.
pub(crate) struct Lexer<'a> {
/// Byte view of `input` (set from `input.as_bytes()` in `new`); used for scanning.
src: &'a [u8],
/// The original text; sliced to produce the `&str` values returned by the lexer.
input: &'a str,
/// Current byte offset of the cursor into `src`.
pos: usize,
}
/// Error value describing where and why parsing failed.
///
/// Holds only a position and a `'static` message, which is what allows the
/// type to be `Copy`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub struct ParseError {
/// Position attached to the error; the `Display` impl reports it as a byte offset.
pos: usize,
/// Static, human-readable description of the problem.
msg: &'static str,
}
impl ParseError {
pub(crate) fn new(pos: usize, msg: &'static str) -> Self {
ParseError { pos, msg }
}
#[must_use]
#[allow(dead_code)] pub fn position(&self) -> usize {
self.pos
}
#[must_use]
#[allow(dead_code)] pub fn message(&self) -> &'static str {
self.msg
}
}
impl fmt::Display for ParseError {
    /// Renders the error as `parse error at byte <pos>: <msg>`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self { pos, msg } = self;
        write!(f, "parse error at byte {}: {}", pos, msg)
    }
}
// Empty impl: no trait method is overridden, so `ParseError` participates in
// `std::error::Error` purely through its derived `Debug` and its `Display` impl.
impl std::error::Error for ParseError {}
impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the first byte of `input`.
    pub(crate) fn new(input: &'a str) -> Self {
        Lexer {
            src: input.as_bytes(),
            input,
            pos: 0,
        }
    }

    /// Returns the current byte offset of the cursor.
    #[inline]
    #[must_use]
    pub(crate) fn pos(&self) -> usize {
        self.pos
    }

    /// Returns `true` once the cursor is at or beyond the end of the input.
    #[inline]
    #[must_use]
    pub(crate) fn is_eof(&self) -> bool {
        self.pos >= self.src.len()
    }

    /// Returns the byte under the cursor, or `0` when the cursor is at or past the end.
    ///
    /// `0` doubles as the end-of-input sentinel, so a literal NUL byte in the
    /// input looks the same as EOF through this method; call `is_eof` to tell
    /// the two apart.
    #[inline]
    #[must_use]
    pub(crate) fn peek(&self) -> u8 {
        self.src.get(self.pos).copied().unwrap_or(0)
    }

    /// Returns the byte `offset` positions ahead of the cursor, or `0` if that
    /// position lies outside the input.
    #[inline]
    #[must_use]
    pub(crate) fn peek_at(&self, offset: usize) -> u8 {
        // `checked_add` keeps an oversized `offset` from overflowing: that would
        // panic in debug builds and wrap to a wrong index in release builds.
        self.pos
            .checked_add(offset)
            .and_then(|i| self.src.get(i))
            .copied()
            .unwrap_or(0)
    }

    /// Returns the text between `start` and the current position.
    ///
    /// # Panics
    ///
    /// Panics if `start` is greater than the current position, if either bound
    /// is past the end of the input, or if either bound is not on a UTF-8
    /// character boundary.
    #[inline]
    #[must_use]
    pub(crate) fn slice(&self, start: usize) -> &'a str {
        &self.input[start..self.pos]
    }

    /// Returns the text in the byte range `start..end`.
    ///
    /// # Panics
    ///
    /// Panics if the range is reversed, out of bounds, or does not fall on
    /// UTF-8 character boundaries. Debug builds check the first two conditions
    /// up front to produce a more descriptive message.
    #[inline]
    #[must_use]
    pub(crate) fn slice_range(&self, start: usize, end: usize) -> &'a str {
        debug_assert!(
            start <= end && end <= self.src.len(),
            "slice_range({start}, {end}): len={}",
            self.src.len()
        );
        &self.input[start..end]
    }

    /// Returns the unread tail of the input, or `""` when the cursor is at or
    /// past the end.
    ///
    /// # Panics
    ///
    /// Panics if the cursor is inside the input but not on a UTF-8 character
    /// boundary.
    #[inline]
    #[must_use]
    pub(crate) fn remaining(&self) -> &'a str {
        // `bump`, `bump_n` and `set_pos` may legitimately leave the cursor past
        // the end; `is_eof` and `peek` tolerate that, so this must not panic.
        if self.is_eof() {
            ""
        } else {
            &self.input[self.pos..]
        }
    }

    /// Moves the cursor to `pos`. The value is not validated against the input length.
    #[inline]
    pub(crate) fn set_pos(&mut self, pos: usize) {
        self.pos = pos;
    }

    /// Advances the cursor by one byte without checking for end of input.
    #[inline]
    pub(crate) fn bump(&mut self) {
        self.pos += 1;
    }

    /// Advances the cursor by `n` bytes without checking for end of input.
    #[inline]
    pub(crate) fn bump_n(&mut self, n: usize) {
        self.pos += n;
    }

    /// Consumes the byte under the cursor if it equals `b`; returns whether it did.
    ///
    /// At end of input nothing is consumed and `false` is returned for every
    /// `b`, including `0`.
    #[inline]
    pub(crate) fn eat(&mut self, b: u8) -> bool {
        // Look at the real byte rather than `peek()`: the `0` EOF sentinel must
        // never match, otherwise `eat(0)` at EOF would push `pos` past the end.
        match self.src.get(self.pos) {
            Some(&current) if current == b => {
                self.pos += 1;
                true
            }
            _ => false,
        }
    }

    /// Consumes the byte sequence `s` if the unread input starts with it;
    /// returns whether it did.
    pub(crate) fn eat_str(&mut self, s: &[u8]) -> bool {
        match self.src.get(self.pos..) {
            Some(rest) if rest.starts_with(s) => {
                self.pos += s.len();
                true
            }
            _ => false,
        }
    }

    /// Skips spaces and tabs. Newlines are not skipped.
    pub(crate) fn skip_blanks(&mut self) {
        while matches!(self.src.get(self.pos), Some(b' ' | b'\t')) {
            self.pos += 1;
        }
    }

    /// If the cursor is on `#`, advances to the next `\n` (left unconsumed) or
    /// to the end of input. Does nothing otherwise.
    pub(crate) fn skip_comment(&mut self) {
        if self.peek() != b'#' {
            return;
        }
        // `peek` returned a real byte, so `pos` is in bounds here.
        let rest = &self.src[self.pos..];
        let comment_len = rest
            .iter()
            .position(|&c| c == b'\n')
            .unwrap_or(rest.len());
        self.pos += comment_len;
    }

    /// Reads an identifier matching `[A-Za-z_][A-Za-z0-9_]*` and returns it.
    ///
    /// Returns `""` and leaves the cursor unchanged if the input does not start
    /// with a letter or underscore.
    #[must_use]
    pub(crate) fn read_name(&mut self) -> &'a str {
        let start = self.pos;
        if matches!(self.src.get(start), Some(c) if c.is_ascii_alphabetic() || *c == b'_') {
            self.pos += 1;
            while matches!(
                self.src.get(self.pos),
                Some(c) if c.is_ascii_alphanumeric() || *c == b'_'
            ) {
                self.pos += 1;
            }
        }
        self.slice(start)
    }

    /// Reads a run of ASCII digits and returns it (possibly empty).
    #[must_use]
    pub(crate) fn read_number(&mut self) -> &'a str {
        let start = self.pos;
        while matches!(self.src.get(self.pos), Some(c) if c.is_ascii_digit()) {
            self.pos += 1;
        }
        self.slice(start)
    }

    /// Scans up to the next `'` and returns the text before it, leaving the
    /// cursor just after the quote.
    ///
    /// The scan starts at the current position, so the opening quote must
    /// already have been consumed.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseError`] with the message `unterminated single quote` if
    /// the input ends first; the cursor is then at end of input and the error
    /// carries that position.
    pub(crate) fn scan_squote(&mut self) -> Result<&'a str, ParseError> {
        let start = self.pos;
        while let Some(&current) = self.src.get(self.pos) {
            if current == b'\'' {
                let content = self.slice(start);
                self.pos += 1; // step over the closing quote
                return Ok(content);
            }
            self.pos += 1;
        }
        Err(self.err("unterminated single quote"))
    }

    /// Returns `true` if the unread input starts with `kw` followed by a word
    /// boundary (end of input or a metacharacter as defined by `is_meta`).
    ///
    /// A keyword that is itself a single metacharacter needs no trailing
    /// boundary. The cursor is not moved.
    #[must_use]
    pub(crate) fn at_keyword(&self, kw: &[u8]) -> bool {
        let rest = match self.src.get(self.pos..) {
            Some(rest) => rest,
            None => return false,
        };
        if !rest.starts_with(kw) {
            return false;
        }
        // Operators such as `;` delimit themselves.
        if matches!(kw, [only] if is_meta(*only)) {
            return true;
        }
        match rest.get(kw.len()) {
            Some(&next) => is_meta(next),
            None => true,
        }
    }

    /// Returns `true` if `at_keyword` holds for any entry of `keywords`.
    #[must_use]
    pub(crate) fn at_any_keyword(&self, keywords: &[&[u8]]) -> bool {
        keywords.iter().any(|kw| self.at_keyword(kw))
    }

    /// Builds a [`ParseError`] carrying `msg` and the current cursor position.
    pub(crate) fn err(&self, msg: &'static str) -> ParseError {
        ParseError::new(self.pos, msg)
    }
}
/// Reports whether `b` is a metacharacter, i.e. a byte that ends a word.
///
/// The set is: space, tab, newline, `;`, `&`, `|`, `(`, `)`, `<`, `>` and the
/// NUL byte (the value `Lexer::peek` yields at end of input).
#[inline]
#[must_use]
pub(crate) const fn is_meta(b: u8) -> bool {
    // Whitespace plus the end-of-input sentinel.
    let blank_or_end = matches!(b, b' ' | b'\t' | b'\n' | b'\0');
    // Separator and control operators.
    let operator = matches!(b, b';' | b'&' | b'|');
    // Grouping parentheses and angle brackets.
    let bracket = matches!(b, b'(' | b')' | b'<' | b'>');
    blank_or_end || operator || bracket
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input is EOF and peeks as `0`; non-empty input peeks its first byte.
    #[test]
    fn peek_and_eof() {
        let empty = Lexer::new("");
        assert!(empty.is_eof());
        assert_eq!(empty.peek(), 0);

        let single = Lexer::new("a");
        assert!(!single.is_eof());
        assert_eq!(single.peek(), b'a');
    }

    /// `eat` consumes only a matching byte.
    #[test]
    fn eat_and_bump() {
        let mut cursor = Lexer::new("ab");
        assert!(cursor.eat(b'a'));
        assert!(!cursor.eat(b'a'));
        assert!(cursor.eat(b'b'));
        assert!(cursor.is_eof());
    }

    /// `eat_str` consumes a whole byte sequence at once.
    #[test]
    fn eat_str() {
        let mut cursor = Lexer::new("then done");
        assert!(cursor.eat_str(b"then"));
        assert_eq!(cursor.peek(), b' ');
        cursor.bump();
        assert!(cursor.eat_str(b"done"));
        assert!(cursor.is_eof());
    }

    /// Blanks are spaces and tabs only; the newline stays unread.
    #[test]
    fn skip_blanks_not_newlines() {
        let mut cursor = Lexer::new(" \t\nfoo");
        cursor.skip_blanks();
        assert_eq!(cursor.peek(), b'\n');
    }

    /// A name runs over letters, digits and underscores and stops at a space.
    #[test]
    fn read_name() {
        let mut cursor = Lexer::new("FOO_bar123 rest");
        assert_eq!(cursor.read_name(), "FOO_bar123");
        assert_eq!(cursor.peek(), b' ');
    }

    /// A name may begin with an underscore.
    #[test]
    fn read_name_underscore_start() {
        let mut cursor = Lexer::new("_private");
        assert_eq!(cursor.read_name(), "_private");
    }

    /// A leading digit yields an empty name and does not move the cursor.
    #[test]
    fn read_name_no_match() {
        let mut cursor = Lexer::new("123abc");
        assert_eq!(cursor.read_name(), "");
        assert_eq!(cursor.pos(), 0);
    }

    /// A number is the leading run of ASCII digits.
    #[test]
    fn read_number() {
        let mut cursor = Lexer::new("42rest");
        assert_eq!(cursor.read_number(), "42");
    }

    /// The quoted body is returned and the closing quote is consumed.
    #[test]
    fn scan_squote() {
        let mut cursor = Lexer::new("hello world'rest");
        let body = cursor.scan_squote().unwrap();
        assert_eq!(body, "hello world");
        assert_eq!(cursor.peek(), b'r');
    }

    /// A keyword matches when followed by a blank, but a mere prefix does not.
    #[test]
    fn at_keyword() {
        let cursor = Lexer::new("then ");
        assert!(cursor.at_keyword(b"then"));
        assert!(!cursor.at_keyword(b"the"));
    }

    /// End of input counts as a keyword boundary.
    #[test]
    fn at_keyword_eof() {
        let cursor = Lexer::new("fi");
        assert!(cursor.at_keyword(b"fi"));
    }

    /// A keyword followed by an identifier character is not a keyword.
    #[test]
    fn at_keyword_no_boundary() {
        let cursor = Lexer::new("done_stuff");
        assert!(!cursor.at_keyword(b"done"));
    }

    /// A comment is skipped up to, but not including, the newline.
    #[test]
    fn skip_comment() {
        let mut cursor = Lexer::new("# this is a comment\nnext");
        cursor.skip_comment();
        assert_eq!(cursor.peek(), b'\n');
    }

    /// The accessors return what was passed to the constructor.
    #[test]
    fn parse_error_accessors() {
        let error = ParseError::new(42, "test error");
        assert_eq!(error.position(), 42);
        assert_eq!(error.message(), "test error");
    }
}