use crate::Operator;
use crate::error::{ParseError, ParseErrorAt};
use crate::parsing::span::Span;
use crate::parsing::token::Token::Op;
use crate::parsing::token::{Spanned, Token, operator_map};
/// Hand-written scanner that walks a source string and hands out one
/// [`Spanned`] token at a time.
struct Lexer<'a> {
    /// The full source text; token spans and error snippets index into it.
    input: &'a str,
    /// Cursor over `input` yielding `(byte_offset, char)` pairs, with
    /// one character of lookahead.
    chars: std::iter::Peekable<std::str::CharIndices<'a>>,
}

impl<'a> Lexer<'a> {
    /// Creates a lexer positioned at the start of `input`.
    fn new(input: &'a str) -> Self {
        let chars = input.char_indices().peekable();
        Lexer { input, chars }
    }

    /// Scans and returns the next token.
    ///
    /// Returns `Ok(None)` once only whitespace (or nothing) remains.
    ///
    /// # Errors
    ///
    /// Returns a [`ParseError`] for an unterminated or unknown `=…=`
    /// operator, a `!` that is not followed by `=`, an unterminated quoted
    /// string, or an unexpected character.
    fn next_token(&mut self) -> Result<Option<Spanned>, ParseError> {
        // Discard whitespace in front of the token.
        while self.chars.next_if(|&(_, c)| c.is_whitespace()).is_some() {}

        let Some((start, ch)) = self.chars.next() else {
            return Ok(None);
        };

        // Span of a token that begins at `start` and is `len` bytes long.
        let span_of = |len: usize| Span::new(start, start + len);

        let spanned = match ch {
            '(' => Spanned::new(Token::LParen, span_of(1)),
            ')' => Spanned::new(Token::RParen, span_of(1)),
            ',' => Spanned::new(Token::Comma, span_of(1)),
            ';' => Spanned::new(Token::Semi, span_of(1)),
            '=' => self.scan_delimited_operator(start)?,
            '!' => {
                // A bare '!' is not an operator on its own.
                if !self.eat_eq() {
                    return Err(self.error(start, 1, "expected '=' after '!'"));
                }
                Spanned::new(Op(Operator::Neq), span_of(2))
            }
            '<' => {
                let (op, len) = if self.eat_eq() {
                    (Operator::Lte, 2)
                } else {
                    (Operator::Lt, 1)
                };
                Spanned::new(Op(op), span_of(len))
            }
            '>' => {
                let (op, len) = if self.eat_eq() {
                    (Operator::Gte, 2)
                } else {
                    (Operator::Gt, 1)
                };
                Spanned::new(Op(op), span_of(len))
            }
            '"' | '\'' => self.scan_quoted(start, ch)?,
            c if !is_word_end(c) => self.scan_literal(start, c),
            c => {
                let message = format!("unexpected character {c:?}");
                return Err(self.error(start, c.len_utf8(), &message));
            }
        };
        Ok(Some(spanned))
    }

    /// Consumes the next character if it is `'='` and reports whether it did.
    fn eat_eq(&mut self) -> bool {
        self.chars.next_if(|&(_, c)| c == '=').is_some()
    }

    /// Scans an operator that opens with the `'='` at byte offset `start`
    /// and runs up to and including the next `'='` (this covers `==` as
    /// well as forms such as `=in=`).
    ///
    /// The text between the delimiters is looked up in `operator_map()`.
    ///
    /// # Errors
    ///
    /// Fails when no closing `'='` exists or when the delimited text is
    /// not a known operator.
    fn scan_delimited_operator(&mut self, start: usize) -> Result<Spanned, ParseError> {
        // `find` consumes everything up to and including the closing '='
        // (or the whole remaining input when there is none).
        let Some((close, _)) = self.chars.find(|&(_, c)| c == '=') else {
            return Err(self.error(
                start,
                1,
                "unterminated operator (missing closing '=')",
            ));
        };
        // '=' is a single byte, so the token ends one past its offset.
        let end = close + 1;
        let op_str = &self.input[start..end];
        match operator_map().get(op_str) {
            Some(op) => Ok(Spanned::new(Op(op.clone()), Span::new(start, end))),
            None => Err(self.error(
                start,
                end - start,
                &format!("unknown operator {op_str:?}"),
            )),
        }
    }

    /// Scans a quoted string whose opening `quote` sits at byte offset
    /// `start`.
    ///
    /// The token's content is the raw text between the quotes (no escape
    /// processing is performed); its span includes both quote characters.
    ///
    /// # Errors
    ///
    /// Fails when the input ends before a matching `quote` is found.
    fn scan_quoted(&mut self, start: usize, quote: char) -> Result<Spanned, ParseError> {
        match self.chars.find(|&(_, c)| c == quote) {
            Some((close, _)) => {
                // Both quote characters are ASCII, hence the `+ 1` offsets.
                let content = self.input[start + 1..close].to_string();
                Ok(Spanned::new(
                    Token::QuotedStr(content),
                    Span::new(start, close + 1),
                ))
            }
            None => Err(self.error(start, 1, "unterminated string literal")),
        }
    }

    /// Scans an unquoted word that begins with `first` at byte offset
    /// `start` and classifies it.
    ///
    /// The word extends until `is_word_end` reports a terminator or the
    /// input ends. Classification order: the keywords `null`/`true`/`false`,
    /// then integer, float, datetime, date, and finally a plain word.
    fn scan_literal(&mut self, start: usize, first: char) -> Spanned {
        let mut end = start + first.len_utf8();
        while let Some((pos, c)) = self.chars.next_if(|&(_, c)| !is_word_end(c)) {
            end = pos + c.len_utf8();
        }

        let word = &self.input[start..end];
        let token = match word {
            "null" => Token::Null,
            "true" => Token::Bool(true),
            "false" => Token::Bool(false),
            _ => {
                if let Some(int) = try_integer(word) {
                    Token::Integer(int)
                } else if let Some(float) = try_float(word) {
                    Token::Float(float)
                } else if let Some(stamp) = as_datetime(word) {
                    stamp
                } else if let Some(day) = as_date(word) {
                    day
                } else {
                    Token::Word(word.to_string())
                }
            }
        };
        Spanned::new(token, Span::new(start, end))
    }

    /// Builds a positioned [`ParseError`] covering `span_len` bytes starting
    /// at byte offset `pos`, carrying the offending source line as snippet.
    fn error(&self, pos: usize, span_len: usize, message: &str) -> ParseError {
        let span = Span::new(pos, pos + span_len);
        let (line, col) = span.line_col(self.input);
        let snippet = span.source_line(self.input).to_string();
        ParseError::At(ParseErrorAt {
            line,
            col,
            span_len,
            snippet,
            message: message.to_string(),
        })
    }
}
/// Returns `true` when `s` has exactly the shape `DDDD-DD-DD`, where every
/// `D` is an ASCII digit.
///
/// Only the layout is checked; month and day ranges are not validated.
fn is_date(s: &str) -> bool {
    match s.as_bytes() {
        // Exactly ten bytes with dashes at offsets 4 and 7.
        [y0, y1, y2, y3, b'-', m0, m1, b'-', d0, d1] => [y0, y1, y2, y3, m0, m1, d0, d1]
            .iter()
            .all(|digit| digit.is_ascii_digit()),
        _ => false,
    }
}
/// Returns `true` when `s` has exactly the shape `DDDD-DD-DDTDD:DD:DDZ`,
/// where every `D` is an ASCII digit.
///
/// Only the layout is checked; field ranges (month, hour, ...) are not
/// validated.
///
/// The check works purely on bytes and never slices `s`. Slicing a `&str`
/// at a fixed byte offset panics when that offset falls inside a multi-byte
/// character, which an arbitrary 20-byte word from user input can trigger.
fn is_datetime(s: &str) -> bool {
    let bytes = s.as_bytes();
    bytes.len() == 20
        && bytes.iter().enumerate().all(|(idx, byte)| match idx {
            4 | 7 => *byte == b'-',
            10 => *byte == b'T',
            13 | 16 => *byte == b':',
            19 => *byte == b'Z',
            // Every remaining position belongs to a numeric field.
            _ => byte.is_ascii_digit(),
        })
}
/// Wraps `s` in [`Token::DateTime`] when `is_datetime` accepts it,
/// otherwise yields `None`.
fn as_datetime(s: &str) -> Option<Token> {
    is_datetime(s).then(|| Token::DateTime(s.to_string()))
}
/// Wraps `s` in [`Token::Date`] when `is_date` accepts it, otherwise
/// yields `None`.
fn as_date(s: &str) -> Option<Token> {
    is_date(s).then(|| Token::Date(s.to_string()))
}
/// Parses `s` as an `i64` when it consists of an optional leading `-`
/// followed by one or more ASCII digits.
///
/// Returns `None` for any other shape (including a leading `+`) and for
/// values that do not fit in an `i64`.
fn try_integer(s: &str) -> Option<i64> {
    let digits = match s.strip_prefix('-') {
        Some(tail) => tail,
        None => s,
    };
    let well_formed = !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit());
    if well_formed {
        // Parse the original text so the sign is honoured.
        s.parse().ok()
    } else {
        None
    }
}
/// Parses `s` as an `f64` when it has the shape `[-]digits.digits`, with
/// at least one ASCII digit on each side of a single dot.
///
/// Exponents, a leading `+`, and a missing integer or fractional part are
/// all rejected with `None`.
fn try_float(s: &str) -> Option<f64> {
    let unsigned = s.strip_prefix('-').unwrap_or(s);
    // Split at the first dot; a second dot ends up in `frac` and fails the
    // digit check below.
    let (whole, frac) = unsigned.split_once('.')?;
    let all_digits = |part: &str| !part.is_empty() && part.bytes().all(|b| b.is_ascii_digit());
    if all_digits(whole) && all_digits(frac) {
        // Parse the original text so the sign is honoured.
        s.parse().ok()
    } else {
        None
    }
}
/// Reports whether `c` terminates an unquoted word: any whitespace, a
/// grouping/separator/quote character, or a character that starts an
/// operator.
fn is_word_end(c: char) -> bool {
    const DELIMITERS: &[char] = &['(', ')', ',', ';', '"', '\'', '=', '<', '>', '!'];
    c.is_whitespace() || DELIMITERS.contains(&c)
}
/// Splits `source` into its complete sequence of spanned tokens.
///
/// # Errors
///
/// Returns the first [`ParseError`] reported by the lexer; tokens scanned
/// before the error are discarded.
pub fn tokenize(source: &str) -> Result<Vec<Spanned>, ParseError> {
    let mut lexer = Lexer::new(source);
    // `transpose` turns `Ok(None)` into end-of-iteration, and collecting
    // into a `Result` stops at the first error.
    std::iter::from_fn(|| lexer.next_token().transpose()).collect()
}
// Unit tests for the lexer. They cover token kinds, operator recognition,
// literal classification, error reporting, span offsets, and non-ASCII input.
#[cfg(test)]
mod tests {
use super::*;
use crate::ast::Operator;
// Helper: tokenizes `source`, panicking on a lex error, and drops the spans
// so tests can compare bare `Token` values.
fn tokens(source: &str) -> Vec<Token> {
tokenize(source)
.unwrap()
.into_iter()
.map(|s| s.token)
.collect()
}
// A bare `word==word` comparison yields word, operator, word.
#[test]
fn simple_constraint() {
assert_eq!(
tokens("name==Alice"),
vec![
Token::Word("name".into()),
Token::Op(Operator::Eq),
Token::Word("Alice".into()),
]
);
}
// `;` and `,` are emitted as `Semi` and `Comma` tokens between constraints.
#[test]
fn and_and_or_separators() {
assert_eq!(
tokens("a==1;b>2,c<3"),
vec![
Token::Word("a".into()),
Token::Op(Operator::Eq),
Token::Integer(1),
Token::Semi,
Token::Word("b".into()),
Token::Op(Operator::Gt),
Token::Integer(2),
Token::Comma,
Token::Word("c".into()),
Token::Op(Operator::Lt),
Token::Integer(3),
]
);
}
// `=in=` is recognised as one operator, followed by a parenthesised list.
#[test]
fn in_list() {
assert_eq!(
tokens("role=in=(admin,user)"),
vec![
Token::Word("role".into()),
Token::Op(Operator::In),
Token::LParen,
Token::Word("admin".into()),
Token::Comma,
Token::Word("user".into()),
Token::RParen,
]
);
}
// Double-quoted strings keep embedded whitespace and drop the quotes.
#[test]
fn quoted_string_double() {
assert_eq!(
tokens(r#"name=="hello world""#),
vec![
Token::Word("name".into()),
Token::Op(Operator::Eq),
Token::QuotedStr("hello world".into()),
]
);
}
// Single-quoted strings behave the same as double-quoted ones.
#[test]
fn quoted_string_single() {
assert_eq!(
tokens("name=='hello world'"),
vec![
Token::Word("name".into()),
Token::Op(Operator::Eq),
Token::QuotedStr("hello world".into()),
]
);
}
// Whitespace around tokens does not change the token stream.
#[test]
fn whitespace_is_ignored() {
assert_eq!(tokens("a == 1"), tokens("a==1"));
assert_eq!(tokens(" a == 1 "), tokens("a==1"));
}
// Multi-letter `=name=` operators are matched as a whole.
#[test]
fn fiql_longest_match() {
assert_eq!(tokens("x=between=(1,2)")[1], Token::Op(Operator::Between));
assert_eq!(tokens("x=notnull=true")[1], Token::Op(Operator::NotNull));
}
// `<=`, `>=` and `!=` are single two-character operators.
#[test]
fn symbolic_two_char_before_one_char() {
assert_eq!(tokens("a<=b")[1], Token::Op(Operator::Lte));
assert_eq!(tokens("a>=b")[1], Token::Op(Operator::Gte));
assert_eq!(tokens("a!=b")[1], Token::Op(Operator::Neq));
}
// A string with no closing quote is a positioned parse error.
#[test]
fn unterminated_string_returns_error() {
let err = tokenize(r#"name=="oops"#);
assert!(matches!(err, Err(ParseError::At(_))));
}
// `null`, `true` and `false` are keywords, not plain words.
#[test]
fn null_and_bool_keywords() {
assert_eq!(tokens("null"), vec![Token::Null]);
assert_eq!(tokens("true"), vec![Token::Bool(true)]);
assert_eq!(tokens("false"), vec![Token::Bool(false)]);
}
// Signed and unsigned integers and `digits.digits` floats are typed.
#[test]
fn integer_and_float_literals() {
assert_eq!(tokens("0"), vec![Token::Integer(0)]);
assert_eq!(tokens("42"), vec![Token::Integer(42)]);
assert_eq!(tokens("-7"), vec![Token::Integer(-7)]);
assert_eq!(tokens("3.5"), vec![Token::Float(3.5)]);
assert_eq!(tokens("-0.5"), vec![Token::Float(-0.5)]);
}
// A float needs digits before the dot; otherwise it stays a word.
#[test]
fn leading_dot_is_word_not_float() {
assert_eq!(tokens(".14"), vec![Token::Word(".14".into())]);
assert_eq!(tokens("-.14"), vec![Token::Word("-.14".into())]);
}
// A float needs digits after the dot; otherwise it stays a word.
#[test]
fn trailing_dot_is_word_not_float() {
assert_eq!(tokens("3."), vec![Token::Word("3.".into())]);
}
// Any non-digit in the fractional part demotes the literal to a word.
#[test]
fn non_digit_in_fractional_part_is_word() {
assert_eq!(tokens("3.14a5"), vec![Token::Word("3.14a5".into())]);
}
// `YYYY-MM-DD` shaped words become `Date` tokens holding the original text.
#[test]
fn date_literal() {
assert_eq!(tokens("2024-01-15"), vec![Token::Date("2024-01-15".into())]);
assert_eq!(tokens("1900-12-31"), vec![Token::Date("1900-12-31".into())]);
}
// `YYYY-MM-DDTHH:MM:SSZ` shaped words become `DateTime` tokens.
#[test]
fn datetime_literal() {
assert_eq!(
tokens("2024-01-15T10:30:00Z"),
vec![Token::DateTime("2024-01-15T10:30:00Z".into())]
);
}
// A datetime is one token; it is not split into a date plus a remainder.
#[test]
fn datetime_takes_priority_over_date_prefix() {
let toks = tokens("2024-01-15T10:30:00Z");
assert_eq!(toks.len(), 1);
assert!(matches!(toks[0], Token::DateTime(_)));
}
// Dates are recognised on the right-hand side of a comparison.
#[test]
fn date_in_constraint() {
assert_eq!(
tokens("created_at>=2024-01-15"),
vec![
Token::Word("created_at".into()),
Token::Op(Operator::Gte),
Token::Date("2024-01-15".into()),
]
);
}
// Date-like text with the wrong field width or separator stays a word.
#[test]
fn malformed_date_stays_word() {
assert_eq!(tokens("2024-1-15"), vec![Token::Word("2024-1-15".into())]);
assert_eq!(tokens("2024/01/15"), vec![Token::Word("2024/01/15".into())]);
}
// A datetime without the trailing `Z` stays a word.
#[test]
fn malformed_datetime_stays_word() {
assert_eq!(
tokens("2024-01-15T10:30:00"),
vec![Token::Word("2024-01-15T10:30:00".into())]
);
}
// An opening `=` with no closing `=` reports "unterminated operator".
#[test]
fn unterminated_fiql_operator() {
let err = tokenize("name=foo");
if let Err(ParseError::At(e)) = err {
assert!(
e.message.contains("unterminated operator"),
"got: {}",
e.message
);
} else {
panic!("expected ParseError::At");
}
}
// A well-delimited but unrecognised operator is quoted in the error message.
#[test]
fn unknown_fiql_operator() {
let err = tokenize("name=foo=bar");
if let Err(ParseError::At(e)) = err {
assert!(e.message.contains("unknown operator"), "got: {}", e.message);
assert!(
e.message.contains("=foo="),
"message should quote the bad op: {}",
e.message
);
} else {
panic!("expected ParseError::At");
}
}
// `!` not followed by `=` is an error.
#[test]
fn lone_exclamation_mark() {
let err = tokenize("name!bar");
assert!(matches!(err, Err(ParseError::At(_))));
}
// The unterminated-operator error is reported at line 1, column 5, i.e. at
// the opening `=` (byte offset 4; columns appear to be 1-based).
#[test]
fn unterminated_fiql_col_is_correct() {
let err = tokenize("name=bad");
if let Err(ParseError::At(e)) = err {
assert_eq!(e.line, 1);
assert_eq!(e.col, 5);
} else {
panic!("expected ParseError::At");
}
}
// The unknown-operator error span covers the whole `=xyz=` text (5 bytes).
#[test]
fn unknown_operator_span_covers_full_token() {
let err = tokenize("=xyz=");
if let Err(ParseError::At(e)) = err {
assert_eq!(e.col, 1);
assert_eq!(e.span_len, 5);
} else {
panic!("expected ParseError::At");
}
}
// Spans are half-open `[start, end)` byte ranges into the source.
#[test]
fn spans_are_correct() {
let spanned = tokenize("name==Alice").unwrap();
assert_eq!(spanned[0].span, Span::new(0, 4)); assert_eq!(spanned[1].span, Span::new(4, 6)); assert_eq!(spanned[2].span, Span::new(6, 11)); }
// Single-character tokens get one-byte spans.
#[test]
fn parens_have_correct_spans() {
let spanned = tokenize("(a)").unwrap();
assert_eq!(spanned[0].span, Span::new(0, 1));
assert_eq!(spanned[2].span, Span::new(2, 3));
}
// Accented (multi-byte) characters are accepted inside words.
#[test]
fn accented_chars_in_word_tokens() {
assert_eq!(
tokens("prénom==André"),
vec![
Token::Word("prénom".into()),
Token::Op(Operator::Eq),
Token::Word("André".into()),
]
);
}
// Spans count bytes, not characters: `é` occupies two bytes, so the
// six-character word `prénom` spans bytes 0..7.
#[test]
fn accented_word_byte_spans() {
let spanned = tokenize("prénom==André").unwrap();
assert_eq!(spanned[0].span, Span::new(0, 7)); assert_eq!(spanned[1].span, Span::new(7, 9)); assert_eq!(spanned[2].span, Span::new(9, 15)); }
// CJK characters are accepted inside words.
#[test]
fn cjk_chars_in_word() {
assert_eq!(
tokens("label==日本語"),
vec![
Token::Word("label".into()),
Token::Op(Operator::Eq),
Token::Word("日本語".into()),
]
);
}
// Emoji (four-byte characters) are accepted inside words.
#[test]
fn emoji_in_word() {
assert_eq!(
tokens("tag==🚀"),
vec![
Token::Word("tag".into()),
Token::Op(Operator::Eq),
Token::Word("🚀".into()),
]
);
}
// Accented characters are preserved inside quoted strings.
#[test]
fn unicode_in_quoted_string_double() {
assert_eq!(
tokens(r#"name=="Ségolène Royal""#),
vec![
Token::Word("name".into()),
Token::Op(Operator::Eq),
Token::QuotedStr("Ségolène Royal".into()),
]
);
}
// CJK characters are preserved inside quoted strings.
#[test]
fn cjk_in_quoted_string() {
assert_eq!(
tokens(r#"label=="日本語テスト""#),
vec![
Token::Word("label".into()),
Token::Op(Operator::Eq),
Token::QuotedStr("日本語テスト".into()),
]
);
}
// Emoji are preserved inside quoted strings.
#[test]
fn emoji_in_quoted_string() {
assert_eq!(
tokens(r#"tag=="🚀 rocket""#),
vec![
Token::Word("tag".into()),
Token::Op(Operator::Eq),
Token::QuotedStr("🚀 rocket".into()),
]
);
}
// An unterminated string containing multi-byte characters still yields a
// parse error rather than a panic.
#[test]
fn unterminated_string_with_unicode() {
let err = tokenize(r#"name=="café"#);
assert!(matches!(err, Err(ParseError::At(_))));
}
}