use std::{char, iter::Peekable, str::CharIndices};
/// A single token produced by the query lexer.
///
/// Every string slice borrows from the query string that was handed to the
/// lexer, so tokens are cheap to clone and never own text themselves.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum QueryToken<'a> {
    /// An attribute term of the form `+name:v1,v2` or `-name:v1,v2`.
    ///
    /// * `bool` - `true` when the term started with `+`, `false` when it
    ///   started with `-`.
    /// * `&str` - the non-empty, purely alphanumeric name between the sign
    ///   and the colon.
    /// * `Vec<&str>` - the comma-separated values following the colon, up to
    ///   the next whitespace; empty values are dropped, so the list may be
    ///   empty.
    Attribute(bool, &'a str, Vec<&'a str>),
    /// Any other whitespace-delimited run of characters, including terms that
    /// start with `+`/`-` but are not well-formed attributes.
    Freetext(&'a str),
}
/// Lexer that walks a query string and hands out tokens that borrow from it.
///
/// Cloning the lexer yields an independent cursor over the same input.
#[derive(Debug, Clone)]
pub struct QueryLexer<'a> {
    /// The complete input; token slices are cut out of it by byte offset.
    query_str: &'a str,
    /// Cursor over `query_str` yielding `(byte_offset, char)` pairs, with
    /// one-character lookahead.
    char_it: Peekable<CharIndices<'a>>,
}
impl<'a> QueryLexer<'a> {
    /// Creates a lexer positioned at the first character of `query_str`.
    pub fn new(query_str: &'a str) -> Self {
        let char_it = query_str.char_indices().peekable();
        Self { query_str, char_it }
    }

    /// Skips leading whitespace and lexes one token.
    ///
    /// Returns `None` once nothing but whitespace (or nothing at all) is left.
    fn next_token(&mut self) -> Option<QueryToken<'a>> {
        self.skip_whitespace();
        let (token_start, lead) = self.char_it.peek().copied()?;
        let token = match lead {
            // A sign may introduce an attribute; `read_attribute` falls back
            // to free text when the rest is not well-formed.
            '+' | '-' => self.read_attribute(),
            _ => self.read_freetext(token_start),
        };
        Some(token)
    }

    /// Consumes characters for as long as the next one is whitespace.
    fn skip_whitespace(&mut self) {
        while matches!(self.char_it.peek(), Some(&(_, c)) if char::is_whitespace(c)) {
            self.char_it.next();
        }
    }

    /// Consumes characters until the next one is whitespace or the input ends.
    ///
    /// Returns the byte offset of that whitespace character (which is left
    /// unconsumed), or the length of the input if none was found.
    fn scan_to_whitespace(&mut self) -> usize {
        loop {
            match self.char_it.peek() {
                None => break self.query_str.len(),
                Some(&(idx, c)) if char::is_whitespace(c) => break idx,
                Some(_) => {
                    self.char_it.next();
                }
            }
        }
    }

    /// Reads a free-text token that began at byte offset `start_idx` and runs
    /// up to the next whitespace or the end of the input.
    fn read_freetext(&mut self, start_idx: usize) -> QueryToken<'a> {
        let end_idx = self.scan_to_whitespace();
        QueryToken::Freetext(&self.query_str[start_idx..end_idx])
    }

    /// Reads a token starting with `+` or `-`.
    ///
    /// Produces an `Attribute` when the sign is followed by a non-empty
    /// alphanumeric name and a colon; otherwise the whole term, sign included,
    /// is returned as `Freetext`.
    ///
    /// Panics if the input is already exhausted; the caller peeks first.
    fn read_attribute(&mut self) -> QueryToken<'a> {
        let (sign_idx, sign) = self.char_it.next().unwrap();
        // The sign is a single byte, so the name starts right after it.
        let name = match self.read_attribute_index(sign_idx + 1) {
            (name, true) if !name.is_empty() => name,
            _ => return self.read_freetext(sign_idx),
        };
        let (colon_idx, separator) = self
            .char_it
            .next()
            .expect("if attribute_ok is true there must be a next char");
        assert_eq!(
            separator, ':',
            "if attribute_ok is true, the next char should be a colon"
        );
        let values = self.read_attribute_values(colon_idx + 1);
        let has_plus_sign = sign == '+';
        QueryToken::Attribute(has_plus_sign, name, values)
    }

    /// Consumes the alphanumeric run that starts at byte offset `start_idx`.
    ///
    /// Returns the run together with a flag telling whether the character
    /// that ended it (left unconsumed) is a colon. If the input ends before
    /// any terminating character shows up, the result is `("", false)`.
    fn read_attribute_index(&mut self, start_idx: usize) -> (&'a str, bool) {
        loop {
            match self.char_it.peek() {
                None => return ("", false),
                Some(&(_, c)) if char::is_alphanumeric(c) => {
                    self.char_it.next();
                }
                // A colon is not alphanumeric, so it ends the run here too.
                Some(&(idx, c)) => return (&self.query_str[start_idx..idx], c == ':'),
            }
        }
    }

    /// Reads the comma-separated value list that starts at byte offset
    /// `value_start_idx` and extends to the next whitespace or end of input.
    ///
    /// Empty values (leading, trailing or doubled commas) are dropped.
    fn read_attribute_values(&mut self, value_start_idx: usize) -> Vec<&'a str> {
        let source: &'a str = self.query_str;
        let values_end = self.scan_to_whitespace();
        source[value_start_idx..values_end]
            .split(',')
            .filter(|value| !value.is_empty())
            .collect()
    }
}
impl<'a> Iterator for QueryLexer<'a> {
type Item = QueryToken<'a>;
fn next(&mut self) -> Option<Self::Item> {
self.next_token()
}
}
// Unit tests for the lexer: each case feeds one query string through
// `QueryLexer` and compares the collected tokens with an expected list.
#[cfg(test)]
mod tests {
use super::QueryToken::*;
use super::*;
// Declares one `#[test]` function per invocation:
// `query_lexer_test! { name "query"; expected_token, ... }`.
// The list after the `;` may be empty and may end with a trailing comma.
macro_rules! query_lexer_test {
($name:ident $query:literal; $($res:expr),* $(,)?) => {
#[test]
fn $name() {
let ql = QueryLexer::new($query);
let result: Vec<QueryToken> = ql.collect();
assert_eq!(result, vec![$($res),*]);
}
};
}
// Degenerate inputs: empty string, single characters and lone signs.
// NOTE(review): the non-ASCII literals in this module look mis-encoded
// (possible mojibake). Each case uses the same text for input and expected
// output, so the assertions stay self-consistent - confirm the file encoding.
query_lexer_test! {empty "";}
query_lexer_test! {single_char "A"; Freetext("A")}
query_lexer_test! {single_umlaut "ร"; Freetext("ร")}
query_lexer_test! {single_emoji "โ๐ผ"; Freetext("โ๐ผ")}
query_lexer_test! {single_plus "+"; Freetext("+")}
query_lexer_test! {single_minus "-"; Freetext("-")}
query_lexer_test! {single_colon ":"; Freetext(":")}
// Smallest attribute forms; a sign without `name:` falls back to free text.
query_lexer_test! {single_attribute "+a:b"; Attribute(true, "a", vec!["b"])}
query_lexer_test! {half_attribute "+a"; Freetext("+a")}
query_lexer_test! {plus_colon "+:"; Freetext("+:")}
query_lexer_test! {colon_plus ":+"; Freetext(":+")}
// An attribute with nothing after the colon yields an empty value list.
query_lexer_test! {empty_attribute "+a:"; Attribute(true, "a", vec![])}
query_lexer_test! {empty_attribute_space "+a: "; Attribute(true, "a", vec![])}
// Free text and attributes mixed in one query.
query_lexer_test! {
basic "hello +zipcode:12345 +pet:Dog -name:Hans world";
Freetext("hello"),
Attribute(true, "zipcode", vec!["12345"]),
Attribute(true, "pet", vec!["Dog"]),
Attribute(false, "name", vec!["Hans"]),
Freetext("world"),
}
// Same tokens as `basic`; extra spaces, tabs and newlines are ignored.
query_lexer_test! {
spaces " \t hello +zipcode:12345 \n +pet:Dog -name:Hans world ";
Freetext("hello"),
Attribute(true, "zipcode", vec!["12345"]),
Attribute(true, "pet", vec!["Dog"]),
Attribute(false, "name", vec!["Hans"]),
Freetext("world"),
}
// Comma-separated values; empty entries from doubled or trailing commas are
// dropped.
query_lexer_test! {
comma "+a1:v1 +a2:v1,v2 +a3:v1,v2,v3 -a4:v1,,v2 -a5:v1,v2, +a6:,,,";
Attribute(true, "a1", vec!["v1"]),
Attribute(true, "a2", vec!["v1", "v2"]),
Attribute(true, "a3", vec!["v1", "v2", "v3"]),
Attribute(false, "a4", vec!["v1", "v2"]),
Attribute(false, "a5", vec!["v1", "v2"]),
Attribute(true, "a6", vec![]),
}
// Arbitrary symbols are split on whitespace only and come out as free text.
query_lexer_test! {
garbage "\ne376$$bf% sfse-ยง$\t hello+world รรร-+- ๐โ๐ผ\n\t";
Freetext("e376$$bf%"),
Freetext("sfse-ยง$"),
Freetext("hello+world"),
Freetext("รรร-+-"),
Freetext("๐โ๐ผ"),
}
// Signs without a complete `name:` part stay free text.
query_lexer_test! {
incomplete " + - +a -b +a-b ";
Freetext("+"),
Freetext("-"),
Freetext("+a"),
Freetext("-b"),
Freetext("+a-b"),
}
// Once inside a value list, signs and colons are ordinary value characters;
// only commas and whitespace delimit values.
query_lexer_test! {
chained "+a:hello+b:world-foo:+bar,-baz:,buzz";
Attribute(true, "a", vec!["hello+b:world-foo:+bar", "-baz:", "buzz"]),
}
}