use std::io::{BufReader, Read};
use once_cell::sync::Lazy;
use regex::Regex;
use crate::{bytes, error::Error};
/// Sentinel character returned by `Scanner::read` once the whole input buffer
/// has been consumed; `Scanner::scan` turns it into a `Token::Eof`.
const EOF: char = '\0';
/// Join operator recognised by the scanner: `&&` or `||`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JoinOp {
    /// The `&&` operator.
    And,
    /// The `||` operator.
    Or,
}

impl JoinOp {
    /// Parses the textual form of a join operator.
    ///
    /// Returns `None` unless `str` is exactly `"&&"` or `"||"`.
    pub fn from_str(str: &str) -> Option<Self> {
        match str {
            "&&" => Some(Self::And),
            "||" => Some(Self::Or),
            _ => None,
        }
    }

    /// Returns the operator's textual form.
    ///
    /// The result is a string literal, so it is `'static` and can outlive the
    /// value it was obtained from.
    fn as_str(&self) -> &'static str {
        match self {
            Self::And => "&&",
            Self::Or => "||",
        }
    }
}

impl std::fmt::Display for JoinOp {
    /// Writes the operator exactly as it appears in source text.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
/// Comparison ("sign") operator recognised by the scanner.
///
/// Every plain operator has a `?`-prefixed `Any*` counterpart. `None` is the
/// default value and has an empty textual form.
#[derive(Debug, Default, PartialEq, Eq, Clone)]
pub enum SignOp {
    /// No operator; displayed as an empty string.
    #[default]
    None,
    /// `=`
    Eq,
    /// `!=`
    Neq,
    /// `~`
    Like,
    /// `!~`
    Nlike,
    /// `<`
    Lt,
    /// `<=`
    Lte,
    /// `>`
    Gt,
    /// `>=`
    Gte,
    /// `?=`
    AnyEq,
    /// `?!=`
    AnyNeq,
    /// `?~`
    AnyLike,
    /// `?!~`
    AnyNlike,
    /// `?<`
    AnyLt,
    /// `?<=`
    AnyLte,
    /// `?>`
    AnyGt,
    /// `?>=`
    AnyGte,
}

impl SignOp {
    /// Parses the textual form of a sign operator.
    ///
    /// Returns `None` (the `Option`, not `SignOp::None`) for any string that
    /// is not one of the sixteen supported operators, including `""`.
    pub fn from_str(str: &str) -> Option<Self> {
        match str {
            "=" => Some(Self::Eq),
            "!=" => Some(Self::Neq),
            "~" => Some(Self::Like),
            "!~" => Some(Self::Nlike),
            "<" => Some(Self::Lt),
            "<=" => Some(Self::Lte),
            ">" => Some(Self::Gt),
            ">=" => Some(Self::Gte),
            "?=" => Some(Self::AnyEq),
            "?!=" => Some(Self::AnyNeq),
            "?~" => Some(Self::AnyLike),
            "?!~" => Some(Self::AnyNlike),
            "?<" => Some(Self::AnyLt),
            "?<=" => Some(Self::AnyLte),
            "?>" => Some(Self::AnyGt),
            "?>=" => Some(Self::AnyGte),
            _ => None,
        }
    }

    /// Returns the operator's textual form (`""` for `SignOp::None`).
    ///
    /// The result is a string literal, so it is `'static` and can outlive the
    /// value it was obtained from.
    fn as_str(&self) -> &'static str {
        match self {
            Self::None => "",
            Self::Eq => "=",
            Self::Neq => "!=",
            Self::Like => "~",
            Self::Nlike => "!~",
            Self::Lt => "<",
            Self::Lte => "<=",
            Self::Gt => ">",
            Self::Gte => ">=",
            Self::AnyEq => "?=",
            Self::AnyNeq => "?!=",
            Self::AnyLike => "?~",
            Self::AnyNlike => "?!~",
            Self::AnyLt => "?<",
            Self::AnyLte => "?<=",
            Self::AnyGt => "?>",
            Self::AnyGte => "?>=",
        }
    }
}

impl std::fmt::Display for SignOp {
    /// Writes the operator exactly as it appears in source text.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.write_str(self.as_str())
    }
}
/// A lexical token produced by the scanner.
///
/// Every variant except `None` carries the token's literal text.
#[derive(Debug, Default, PartialEq, Eq, Clone)]
pub enum Token {
    /// Placeholder / default value; has an empty kind and an empty literal.
    #[default]
    None,
    /// End of input.
    Eof(String),
    /// A run of whitespace characters.
    Ws(String),
    /// A join operator (`&&`, `||`).
    Join(String),
    /// A sign (comparison) operator such as `=` or `?!~`.
    Sign(String),
    /// An identifier.
    Identifier(String),
    /// A numeric literal.
    Number(String),
    /// A quoted text literal.
    Text(String),
    /// The content of a parenthesised group.
    Group(String),
    /// The content of a `//` comment.
    Comment(String),
}

impl Token {
    /// Returns the lowercase name of the token's kind (`""` for `None`).
    ///
    /// The result is a string literal, so it is `'static` and does not borrow
    /// from the token.
    pub fn kind(&self) -> &'static str {
        match self {
            Self::None => "",
            Self::Eof(_) => "eof",
            Self::Ws(_) => "whitespace",
            Self::Join(_) => "join",
            Self::Sign(_) => "sign",
            Self::Identifier(_) => "identifier",
            Self::Number(_) => "number",
            Self::Text(_) => "text",
            Self::Group(_) => "group",
            Self::Comment(_) => "comment",
        }
    }

    /// Returns the literal text carried by the token (`""` for `None`).
    pub fn literal(&self) -> &str {
        match self {
            Self::None => "",
            // Every other variant wraps exactly one `String`.
            Self::Eof(value)
            | Self::Ws(value)
            | Self::Join(value)
            | Self::Sign(value)
            | Self::Identifier(value)
            | Self::Number(value)
            | Self::Text(value)
            | Self::Group(value)
            | Self::Comment(value) => value,
        }
    }
}

impl std::fmt::Display for Token {
    /// Formats the token as `{kind literal}`: the kind, a single space and the
    /// literal, wrapped in braces.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{{{} {}}}", self.kind(), self.literal())
    }
}
/// Tokenizer over an in-memory copy of the input.
///
/// The whole input is read into `buffer` up front (see `Scanner::new`) and
/// then consumed one byte at a time by `scan`.
pub struct Scanner {
/// Raw input bytes.
buffer: Vec<u8>,
/// Index into `buffer` of the next byte to be read.
pos: usize,
}
impl Scanner {
pub fn new(mut r: BufReader<impl Read>) -> Result<Self, Error> {
let mut buffer = Vec::new();
r.read_to_end(&mut buffer)
.map_err(|err| Error::Buffer(err.to_string()))?;
Ok(Scanner { buffer, pos: 0 })
}
pub fn scan(&mut self) -> Result<Token, Error> {
let ch = self.read();
if is_whitespace_char(ch) {
self.unread();
return self.scan_whitespace();
}
if is_group_start_char(ch) {
self.unread();
return self.scan_group();
}
if is_identifier_start_char(ch) {
self.unread();
return self.scan_identifier();
}
if is_number_start_char(ch) {
self.unread();
return self.scan_number();
}
if is_text_start_char(ch) {
self.unread();
return self.scan_text(false);
}
if is_sign_start_char(ch) {
self.unread();
return self.scan_sign();
}
if is_join_start_char(ch) {
self.unread();
return self.scan_join();
}
if is_comment_start_char(ch) {
self.unread();
return self.scan_comment();
}
if ch == EOF {
return Ok(Token::Eof(ch.to_string()));
}
Err(Error::Unexpected(format!("Unexpected character {ch}")))
}
/// Consumes the current run of contiguous whitespace characters and returns
/// it as a `Token::Ws`.
fn scan_whitespace(&mut self) -> Result<Token, Error> {
    let mut buf = bytes::Buffer::new();
    loop {
        match self.read() {
            EOF => break,
            ch if is_whitespace_char(ch) => buf.write_char(ch)?,
            _ => {
                // First non-whitespace character belongs to the next token.
                self.unread();
                break;
            }
        }
    }
    Ok(Token::Ws(buf.into_string()?))
}
fn scan_identifier(&mut self) -> Result<Token, Error> {
let mut buf = bytes::Buffer::new();
loop {
let ch = self.read();
if ch == EOF {
break;
}
if !is_identifier_start_char(ch) && !is_digit_char(ch) && ch != '.' && ch != ':' {
self.unread();
break;
}
buf.write_char(ch)?
}
let literal = buf.into_string()?;
if !is_identifier(&literal) {
return Err(Error::Invalid(format!("Invalid identifier {literal}")));
}
Ok(Token::Identifier(literal))
}
fn scan_number(&mut self) -> Result<Token, Error> {
let mut buf = bytes::Buffer::new();
buf.write_char(self.read())?;
loop {
let ch = self.read();
if ch == EOF {
break;
}
if !is_digit_char(ch) && ch != '.' {
self.unread();
break;
}
buf.write_char(ch)?;
}
let literal = buf.into_string()?;
if !is_number(&literal) {
return Err(Error::Invalid(format!("Invalid number {literal}")));
}
Ok(Token::Number(literal))
}
fn scan_text(&mut self, preserve_quotes: bool) -> Result<Token, Error> {
let mut buf = bytes::Buffer::new();
let first_ch = self.read();
buf.write_char(first_ch)?;
let mut prev_ch = '\0';
let mut has_matching_quotes = false;
loop {
let ch = self.read();
if ch == EOF {
break;
}
buf.write_char(ch)?;
if ch == first_ch && prev_ch != '\\' {
has_matching_quotes = true;
break;
}
prev_ch = ch;
}
let mut literal = buf.into_string()?;
if !has_matching_quotes {
return Err(Error::Invalid(format!("Invalid quoted text {literal}")));
} else if !preserve_quotes {
literal = literal[1..literal.len() - 1].to_string();
let first_ch_str = first_ch.to_string();
literal = literal.replace(&("\\".to_owned() + &first_ch_str), &first_ch_str);
}
Ok(Token::Text(literal))
}
fn scan_sign(&mut self) -> Result<Token, Error> {
let mut buf = bytes::Buffer::new();
loop {
let ch = self.read();
if ch == EOF {
break;
}
if !is_sign_start_char(ch) {
self.unread();
break;
}
buf.write_char(ch)?;
}
let literal = buf.into_string()?;
if !is_sign_operator(&literal) {
return Err(Error::Invalid(format!("Invalid sign operator {literal}")));
}
Ok(Token::Sign(literal))
}
fn scan_join(&mut self) -> Result<Token, Error> {
let mut buf = bytes::Buffer::new();
loop {
let ch = self.read();
if ch == EOF {
break;
}
if !is_join_start_char(ch) {
self.unread();
break;
}
buf.write_char(ch)?;
}
let literal = buf.into_string()?;
if !is_join_operator(&literal) {
return Err(Error::Invalid(format!("Invalid join operator {literal}",)));
}
Ok(Token::Join(literal))
}
/// Consumes a parenthesised group and returns its content without the
/// outermost pair of parentheses.
///
/// Nested parentheses are tracked with a depth counter and kept in the
/// content. Quoted text inside the group is scanned with `scan_text(true)`, so
/// it is copied verbatim (quotes included) and parentheses inside it do not
/// affect the depth.
///
/// # Errors
///
/// Returns `Error::Invalid` when the first character is not a group opener or
/// the input ends while groups are still open; propagates any error from
/// scanning embedded quoted text.
fn scan_group(&mut self) -> Result<Token, Error> {
    let mut buf = bytes::Buffer::new();
    let opener = self.read();
    let mut depth: i32 = 1;
    loop {
        let ch = self.read();
        match ch {
            EOF => break,
            c if is_group_start_char(c) => {
                depth += 1;
                buf.write_char(c)?;
            }
            c if is_text_start_char(c) => {
                // Let the text scanner consume the whole quoted section.
                self.unread();
                let quoted = self.scan_text(true)?;
                buf.write_string(quoted.literal())?;
            }
            ')' => {
                depth -= 1;
                if depth <= 0 {
                    // Closing bracket of the outermost group: not part of the content.
                    break;
                }
                buf.write_char(ch)?;
            }
            c => buf.write_char(c)?,
        }
    }
    let literal = buf.into_string()?;
    let well_formed = is_group_start_char(opener) && depth <= 0;
    if !well_formed {
        return Err(Error::Invalid(format!(
            "Invalid formatted group - missing {depth} closing bracket(s)"
        )));
    }
    Ok(Token::Group(literal))
}
fn scan_comment(&mut self) -> Result<Token, Error> {
let mut buf = bytes::Buffer::new();
if !is_comment_start_char(self.read()) || !is_comment_start_char(self.read()) {
return Err(Error::Invalid("Invalid comment".to_owned()));
}
loop {
let ch = self.read();
if ch == EOF || ch == '\n' {
break;
}
buf.write_char(ch)?;
}
let literal = buf.into_string()?;
Ok(Token::Comment(literal.trim().to_owned()))
}
/// Returns the next input byte as a `char` and advances the position, or
/// `EOF` (without advancing) when the buffer is exhausted.
///
/// Each byte is converted on its own via `char::from(u8)`, so multi-byte
/// sequences are surfaced one byte at a time, and a zero byte in the input is
/// indistinguishable from `EOF`.
fn read(&mut self) -> char {
    match self.buffer.get(self.pos) {
        Some(&byte) => {
            self.pos += 1;
            char::from(byte)
        }
        None => EOF,
    }
}
/// Steps the read position back by one byte; does nothing at the start of the
/// buffer.
fn unread(&mut self) {
    self.pos = self.pos.saturating_sub(1);
}
}
/// Reports whether `ch` is a space, a tab or a line feed.
fn is_whitespace_char(ch: char) -> bool {
    matches!(ch, ' ' | '\t' | '\n')
}
/// Reports whether `ch` is an ASCII letter (`a-z` or `A-Z`).
fn is_letter_char(ch: char) -> bool {
    ch.is_ascii_alphabetic()
}
/// Reports whether `ch` is an ASCII decimal digit (`0-9`).
fn is_digit_char(ch: char) -> bool {
    matches!(ch, '0'..='9')
}
/// Reports whether `ch` may begin an identifier: an ASCII letter, `_`, `@` or
/// `#`.
fn is_identifier_start_char(ch: char) -> bool {
    matches!(ch, '_' | '@' | '#') || is_letter_char(ch)
}
/// Reports whether `ch` is a quote character (`'` or `"`) that opens a text
/// literal.
fn is_text_start_char(ch: char) -> bool {
    matches!(ch, '\'' | '"')
}
/// Reports whether `ch` may begin a number: a minus sign or an ASCII digit.
fn is_number_start_char(ch: char) -> bool {
    matches!(ch, '-') || is_digit_char(ch)
}
/// Reports whether `ch` is one of the characters sign operators are built
/// from: `=`, `?`, `!`, `>`, `<` or `~`.
fn is_sign_start_char(ch: char) -> bool {
    matches!(ch, '=' | '?' | '!' | '>' | '<' | '~')
}
/// Reports whether `ch` is `&` or `|`, the characters join operators are
/// built from.
fn is_join_start_char(ch: char) -> bool {
    matches!(ch, '&' | '|')
}
/// Reports whether `ch` is an opening parenthesis.
fn is_group_start_char(ch: char) -> bool {
    matches!(ch, '(')
}
/// Reports whether `ch` is a forward slash, the character comments start
/// with.
fn is_comment_start_char(ch: char) -> bool {
    matches!(ch, '/')
}
/// Reports whether `literal` is exactly one of the operators accepted by
/// `SignOp::from_str`.
fn is_sign_operator(literal: &str) -> bool {
SignOp::from_str(literal).is_some()
}
/// Reports whether `literal` is exactly one of the operators accepted by
/// `JoinOp::from_str`.
fn is_join_operator(literal: &str) -> bool {
JoinOp::from_str(literal).is_some()
}
/// Reports whether `literal` is a valid number.
///
/// The literal must be non-empty, must not end with `.`, and must parse as an
/// `f64`.
fn is_number(literal: &str) -> bool {
    !literal.is_empty() && !literal.ends_with('.') && literal.parse::<f64>().is_ok()
}
/// Reports whether `literal` is a valid identifier.
///
/// Per the pattern below, an identifier is an optional leading `@`, `#` or
/// `_`, followed by any mix of word characters, `.` and `:`, and must end with
/// at least one word character.
fn is_identifier(literal: &str) -> bool {
// Wrapped in `Lazy` so the pattern is built on first use rather than on every call.
static IDENTIFIER_REGEX: Lazy<Regex> =
Lazy::new(|| Regex::new(r"^[\@\#\_]?[\w\.\:]*\w+$").unwrap());
IDENTIFIER_REGEX.is_match(literal)
}
#[cfg(test)]
mod tests {
use std::io::BufReader;
use crate::scanner::Token;
use super::Scanner;
/// A freshly created scanner must hold the reader's bytes in its buffer.
#[test]
fn test_new_scanner() {
    let input = "test".as_bytes();
    let scanner = Scanner::new(BufReader::new(input)).unwrap();
    let data = std::str::from_utf8(&scanner.buffer[..4]).unwrap();
    assert!(
        data == "test",
        "Expected the scanner reader data to be \"test\", got {data}"
    )
}
/// Table-driven test for `Scanner::scan`.
///
/// Each scenario lists, in order, the outcome expected from successive
/// `scan()` calls; once the list is exhausted the next call must yield an EOF
/// token. For outcomes that are errors the `print` text is informational only
/// (it is never compared).
#[test]
fn test_scanner_scan() {
    // Expected outcome of a single `scan()` call.
    struct Output {
        error: bool,
        print: &'static str,
    }
    struct TestScenario {
        text: &'static str,
        expects: Vec<Output>,
    }

    // Expectation: `scan()` succeeds and the token displays as `print`.
    fn pass(print: &'static str) -> Output {
        Output {
            error: false,
            print,
        }
    }
    // Expectation: `scan()` fails.
    fn fail(print: &'static str) -> Output {
        Output { error: true, print }
    }
    fn case(text: &'static str, expects: Vec<Output>) -> TestScenario {
        TestScenario { text, expects }
    }

    // `Token`'s Display output is `{kind literal}`, so a one-space whitespace
    // token renders with TWO spaces: the separator plus the literal itself.
    // The second space is written as an escape so it cannot be lost to
    // whitespace normalisation.
    const WS: &str = "{whitespace \x20}";

    let test_scenarios = vec![
        case(r" ", vec![pass(WS)]),
        case(
            r"test 123",
            vec![pass(r"{identifier test}"), pass(WS), pass(r"{number 123}")],
        ),
        case(r"test", vec![pass(r"{identifier test}")]),
        case(r"@test.123", vec![pass(r"{identifier @test.123}")]),
        case(r"_test.123", vec![pass(r"{identifier _test.123}")]),
        case(r"#test.123:456", vec![pass(r"{identifier #test.123:456}")]),
        case(
            r".test.123",
            vec![fail(r"{unexpected .}"), pass(r"{identifier test.123}")],
        ),
        case(
            r":test.123",
            vec![fail(r"{unexpected :}"), pass(r"{identifier test.123}")],
        ),
        case(r"test#@", vec![fail(r"{identifier test#@}")]),
        case(r"test'", vec![pass(r"{identifier test}"), fail(r"{text '}")]),
        case(
            r#"test"d"#,
            vec![pass(r"{identifier test}"), fail(r#"{text \"d}"#)],
        ),
        case(r"123", vec![pass(r"{number 123}")]),
        case(r"-123", vec![pass(r"{number -123}")]),
        case(r"-123.456", vec![pass(r"{number -123.456}")]),
        case(r"123.456", vec![pass(r"{number 123.456}")]),
        case(r".123", vec![fail(r"{unexpected .}"), pass(r"{number 123}")]),
        case(
            r"- 123",
            vec![fail(r"{number -}"), pass(WS), pass(r"{number 123}")],
        ),
        case(r"12-3", vec![pass(r"{number 12}"), pass(r"{number -3}")]),
        case(
            r"123.abc",
            vec![fail(r"{number 123.}"), pass(r"{identifier abc}")],
        ),
        case(r#""""#, vec![pass(r"{text }")]),
        case(r"''", vec![pass(r"{text }")]),
        case(r"'test'", vec![pass(r"{text test}")]),
        case(r"'te\'st'", vec![pass(r"{text te'st}")]),
        case(r#""te\"st""#, vec![pass(r#"{text te"st}"#)]),
        case(
            r#""tes@#,;!@#%^'\"t""#,
            vec![pass(r#"{text tes@#,;!@#%^'"t}"#)],
        ),
        case(
            r#"'tes@#,;!@#%^\'"t'"#,
            vec![pass(r#"{text tes@#,;!@#%^'"t}"#)],
        ),
        case(r#""test"#, vec![fail(r#"{text "test}"#)]),
        case(r"'test", vec![fail(r"{text 'test}")]),
        case(r"&&||", vec![fail(r"{join &&||}")]),
        case(
            r"&& ||",
            vec![pass(r"{join &&}"), pass(WS), pass(r"{join ||}")],
        ),
        case(
            r"'||test&&'&&123",
            vec![
                pass(r"{text ||test&&}"),
                pass(r"{join &&}"),
                pass(r"{number 123}"),
            ],
        ),
        case(r"=!=", vec![fail(r"{sign =!=}")]),
        case(
            r"= != ~ !~ > >= < <= ?= ?!= ?~ ?!~ ?> ?>= ?< ?<=",
            vec![
                pass(r"{sign =}"),
                pass(WS),
                pass(r"{sign !=}"),
                pass(WS),
                pass(r"{sign ~}"),
                pass(WS),
                pass(r"{sign !~}"),
                pass(WS),
                pass(r"{sign >}"),
                pass(WS),
                pass(r"{sign >=}"),
                pass(WS),
                pass(r"{sign <}"),
                pass(WS),
                pass(r"{sign <=}"),
                pass(WS),
                pass(r"{sign ?=}"),
                pass(WS),
                pass(r"{sign ?!=}"),
                pass(WS),
                pass(r"{sign ?~}"),
                pass(WS),
                pass(r"{sign ?!~}"),
                pass(WS),
                pass(r"{sign ?>}"),
                pass(WS),
                pass(r"{sign ?>=}"),
                pass(WS),
                pass(r"{sign ?<}"),
                pass(WS),
                pass(r"{sign ?<=}"),
            ],
        ),
        case(r"a)", vec![pass(r"{identifier a}"), fail(r"{unexpected )}")]),
        case(r"(a b c", vec![fail(r"{group a b c}")]),
        case(r"(a b c)", vec![pass(r"{group a b c}")]),
        case(r"((a b c))", vec![pass(r"{group (a b c)}")]),
        case(
            r"((a )b c))",
            vec![pass(r"{group (a )b c}"), fail(r"{unexpected )}")],
        ),
        case(r#"("ab)("c)"#, vec![pass(r#"{group "ab)("c}"#)]),
        case(r#"("ab)(c)"#, vec![fail(r#"{group "ab)(c)}"#)]),
        case(
            r"/ test",
            vec![fail(r"{comment }"), pass(r"{identifier test}")],
        ),
        case(
            r"/ / test",
            vec![
                fail(r"{comment }"),
                fail(r"{comment }"),
                pass(r"{identifier test}"),
            ],
        ),
        case(r"//", vec![pass(r"{comment }")]),
        case(r"//test", vec![pass(r"{comment test}")]),
        case(r"// test", vec![pass(r"{comment test}")]),
        case(r"// test1 //test2 ", vec![pass(r"{comment test1 //test2}")]),
        case(r"///test", vec![pass(r"{comment /test}")]),
    ];

    for (i, scenario) in test_scenarios.iter().enumerate() {
        let mut s = Scanner::new(BufReader::new(scenario.text.as_bytes())).unwrap();
        for (j, expect) in scenario.expects.iter().enumerate() {
            match s.scan() {
                Ok(token) => {
                    assert!(
                        !expect.error,
                        "({}.{}) Expected error, got ok ({})",
                        i, j, token
                    );
                    let token_print = token.to_string();
                    assert!(
                        token_print == expect.print,
                        "({}.{}) Expected token {}, got {}",
                        i,
                        j,
                        expect.print,
                        token_print
                    );
                }
                Err(err) => {
                    assert!(
                        expect.error,
                        "({}.{}) Did not expect error, got {} ({})",
                        i, j, err, scenario.text
                    );
                }
            }
        }
        // Every scenario must be fully consumed by the listed expectations.
        let last_token = s.scan().unwrap();
        assert!(
            matches!(last_token, Token::Eof(_)),
            "({}) Expected EOF token, got {}",
            i,
            last_token
        );
    }
}
}