use std::collections::BTreeSet;
use crate::ft::FtError;
use crate::registry::VectorTable;
use crate::schema::MetadataFieldType;
#[derive(Clone, Copy, Debug, PartialEq)]
pub enum NumericBound {
NegInf,
PosInf,
Inclusive(f64),
Exclusive(f64),
}
impl NumericBound {
#[must_use]
pub fn satisfies(&self, value: f64, is_lower: bool) -> bool {
match (*self, is_lower) {
(Self::NegInf, true) | (Self::PosInf, false) => true,
(Self::NegInf, false) | (Self::PosInf, true) => false,
(Self::Inclusive(b), true) => value >= b,
(Self::Inclusive(b), false) => value <= b,
(Self::Exclusive(b), true) => value > b,
(Self::Exclusive(b), false) => value < b,
}
}
}
#[derive(Clone, Debug, PartialEq)]
pub enum FilterExpr {
All,
NumericRange {
field: String,
min: NumericBound,
max: NumericBound,
},
TextSubstring {
field: String,
query: Vec<u8>,
},
TagSet {
field: String,
tags: Vec<Vec<u8>>,
},
And(Vec<FilterExpr>),
Or(Vec<FilterExpr>),
Not(Box<FilterExpr>),
}
pub fn parse_expr(query: &[u8]) -> Result<FilterExpr, FtError> {
let mut p = Parser::new(query);
let expr = p.parse_or()?;
p.skip_ws();
if !p.at_end() {
return Err(FtError::Syntax(format!(
"FT.SEARCH filter: extra tokens at offset {} ({})",
p.pos,
String::from_utf8_lossy(&p.input[p.pos..]),
)));
}
Ok(expr)
}
pub fn evaluate(
expr: &FilterExpr,
table: &VectorTable,
universe: &BTreeSet<Vec<u8>>,
) -> Result<BTreeSet<Vec<u8>>, FtError> {
match expr {
FilterExpr::All => Ok(universe.clone()),
FilterExpr::NumericRange { field, min, max } => {
ensure_field_kind(table, field, MetadataFieldType::Numeric)?;
let mut out = BTreeSet::new();
for key in universe {
let Some(value) = lookup_numeric_metadata(table, key, field)? else {
continue;
};
if min.satisfies(value, true) && max.satisfies(value, false) {
out.insert(key.clone());
}
}
Ok(out)
}
FilterExpr::TextSubstring { field, query } => {
ensure_text_field(table, field)?;
if let Some(hits) = table.search_text_substring(field, query) {
let mut out = BTreeSet::new();
for (key, _) in hits {
if universe.contains(&key) {
out.insert(key);
}
}
Ok(out)
} else {
let mut out = BTreeSet::new();
for key in universe {
if let Some(bytes) = lookup_metadata_bytes(table, key, field)? {
if contains_subseq(&bytes, query) {
out.insert(key.clone());
}
}
}
Ok(out)
}
}
FilterExpr::TagSet { field, tags } => {
ensure_field_kind(table, field, MetadataFieldType::Tag)?;
let separator = table
.schema
.metadata_fields
.iter()
.find(|f| f.name == *field)
.map_or(b',', crate::schema::MetadataField::effective_tag_separator);
let mut out = BTreeSet::new();
for key in universe {
let Some(bytes) = lookup_metadata_bytes(table, key, field)? else {
continue;
};
if any_tag_present(&bytes, separator, tags) {
out.insert(key.clone());
}
}
Ok(out)
}
FilterExpr::And(children) => {
if children.is_empty() {
return Ok(universe.clone());
}
let mut acc = evaluate(&children[0], table, universe)?;
for child in &children[1..] {
let next = evaluate(child, table, universe)?;
acc = acc.intersection(&next).cloned().collect();
if acc.is_empty() {
break;
}
}
Ok(acc)
}
FilterExpr::Or(children) => {
let mut acc = BTreeSet::new();
for child in children {
let next = evaluate(child, table, universe)?;
acc.extend(next);
}
Ok(acc)
}
FilterExpr::Not(child) => {
let inner = evaluate(child, table, universe)?;
Ok(universe.difference(&inner).cloned().collect())
}
}
}
fn ensure_field_kind(
table: &VectorTable,
field: &str,
kind: MetadataFieldType,
) -> Result<(), FtError> {
let f = table
.schema
.metadata_fields
.iter()
.find(|f| f.name == field)
.ok_or_else(|| FtError::Syntax(format!("FT.SEARCH filter: unknown field {field}")))?;
if f.field_type != kind {
return Err(FtError::Syntax(format!(
"FT.SEARCH filter: field {field} is {actual:?}, not {kind:?}",
actual = f.field_type,
)));
}
Ok(())
}
fn ensure_text_field(table: &VectorTable, field: &str) -> Result<(), FtError> {
if table.has_text_field(field) {
return Ok(());
}
Err(FtError::Syntax(format!(
"FT.SEARCH filter: field {field} is not declared TEXT",
)))
}
fn lookup_metadata_bytes(
table: &VectorTable,
key: &[u8],
field: &str,
) -> Result<Option<Vec<u8>>, FtError> {
let row = table
.engine
.get(key)
.map_err(|e| FtError::Engine(e.to_string()))?;
let Some(row) = row else {
return Ok(None);
};
let Some(value) = row.metadata.get(field) else {
return Ok(None);
};
let bytes = match value {
serde_json::Value::String(s) => s.clone().into_bytes(),
other => other.to_string().into_bytes(),
};
Ok(Some(bytes))
}
fn lookup_numeric_metadata(
table: &VectorTable,
key: &[u8],
field: &str,
) -> Result<Option<f64>, FtError> {
let Some(bytes) = lookup_metadata_bytes(table, key, field)? else {
return Ok(None);
};
let s = match std::str::from_utf8(&bytes) {
Ok(s) => s.trim(),
Err(_) => return Ok(None),
};
Ok(s.parse::<f64>().ok().filter(|v| v.is_finite()))
}
fn any_tag_present(blob: &[u8], separator: u8, wanted: &[Vec<u8>]) -> bool {
for chunk in blob.split(|b| *b == separator) {
let trimmed = trim_ascii(chunk);
for w in wanted {
if trim_ascii(w) == trimmed {
return true;
}
}
}
false
}
fn trim_ascii(bytes: &[u8]) -> &[u8] {
let mut start = 0;
let mut end = bytes.len();
while start < end && bytes[start].is_ascii_whitespace() {
start += 1;
}
while end > start && bytes[end - 1].is_ascii_whitespace() {
end -= 1;
}
&bytes[start..end]
}
fn contains_subseq(haystack: &[u8], needle: &[u8]) -> bool {
if needle.is_empty() {
return true;
}
if haystack.len() < needle.len() {
return false;
}
haystack.windows(needle.len()).any(|w| w == needle)
}
struct Parser<'a> {
input: &'a [u8],
pos: usize,
}
impl<'a> Parser<'a> {
fn new(input: &'a [u8]) -> Self {
Self { input, pos: 0 }
}
fn at_end(&self) -> bool {
self.pos >= self.input.len()
}
fn peek(&self) -> Option<u8> {
self.input.get(self.pos).copied()
}
fn bump(&mut self) -> Option<u8> {
let b = self.peek()?;
self.pos += 1;
Some(b)
}
fn skip_ws(&mut self) {
while let Some(b) = self.peek() {
if b.is_ascii_whitespace() {
self.pos += 1;
} else {
break;
}
}
}
fn parse_or(&mut self) -> Result<FilterExpr, FtError> {
let first = self.parse_and()?;
let mut alts: Vec<FilterExpr> = vec![first];
loop {
self.skip_ws();
if self.peek() == Some(b'|') {
self.bump();
self.skip_ws();
alts.push(self.parse_and()?);
} else {
break;
}
}
if alts.len() == 1 {
Ok(alts.pop().expect("single child"))
} else {
Ok(FilterExpr::Or(alts))
}
}
fn parse_and(&mut self) -> Result<FilterExpr, FtError> {
self.skip_ws();
let first = self.parse_atom()?;
let mut conjs: Vec<FilterExpr> = vec![first];
loop {
self.skip_ws();
match self.peek() {
None | Some(b')' | b'|') => break,
_ => {
let next = self.parse_atom()?;
conjs.push(next);
}
}
}
if conjs.len() == 1 {
Ok(conjs.pop().expect("single child"))
} else {
Ok(FilterExpr::And(conjs))
}
}
fn parse_atom(&mut self) -> Result<FilterExpr, FtError> {
self.skip_ws();
match self.peek() {
None => Err(FtError::Syntax(
"FT.SEARCH filter: unexpected end of expression".to_string(),
)),
Some(b'-') => {
self.bump();
let inner = self.parse_atom()?;
Ok(FilterExpr::Not(Box::new(inner)))
}
Some(b'(') => {
self.bump();
let inner = self.parse_or()?;
self.skip_ws();
if self.peek() != Some(b')') {
return Err(FtError::Syntax(
"FT.SEARCH filter: missing ')' in group".to_string(),
));
}
self.bump();
Ok(inner)
}
Some(b'*') => {
self.bump();
Ok(FilterExpr::All)
}
Some(b'@') => self.parse_field_query(),
Some(b'"') => Err(FtError::Unsupported(
"FT.SEARCH filter: quoted phrases not supported in this build".to_string(),
)),
Some(b'{' | b'[' | b'|' | b')' | b':') => Err(FtError::Syntax(format!(
"FT.SEARCH filter: unexpected character '{}'",
self.peek().unwrap_or(0) as char,
))),
Some(_) => Err(FtError::Syntax(format!(
"FT.SEARCH filter: bare term '{}' must be qualified with '@<field>:'",
String::from_utf8_lossy(self.read_word_preview()),
))),
}
}
fn parse_field_query(&mut self) -> Result<FilterExpr, FtError> {
debug_assert_eq!(self.peek(), Some(b'@'));
self.bump();
let field_start = self.pos;
while let Some(b) = self.peek() {
if b == b':' || b.is_ascii_whitespace() {
break;
}
self.bump();
}
let field_bytes = &self.input[field_start..self.pos];
if field_bytes.is_empty() {
return Err(FtError::Syntax(
"FT.SEARCH filter: empty field name after '@'".to_string(),
));
}
let field = std::str::from_utf8(field_bytes)
.map(str::to_string)
.map_err(|_| FtError::Syntax("FT.SEARCH filter: field is not UTF-8".to_string()))?;
if self.peek() != Some(b':') {
return Err(FtError::Syntax(format!(
"FT.SEARCH filter: expected ':' after field {field}",
)));
}
self.bump();
match self.peek() {
Some(b'[') => self.parse_numeric_range(field),
Some(b'{') => self.parse_tag_set(field),
Some(b'"') => Err(FtError::Unsupported(
"FT.SEARCH filter: quoted phrases not supported in this build".to_string(),
)),
Some(b'(' | b'|' | b')') | None => Err(FtError::Syntax(format!(
"FT.SEARCH filter: empty value for field {field}",
))),
Some(b) if b.is_ascii_whitespace() => Err(FtError::Syntax(format!(
"FT.SEARCH filter: empty value for field {field}",
))),
Some(_) => self.parse_text_token(field),
}
}
fn parse_numeric_range(&mut self, field: String) -> Result<FilterExpr, FtError> {
debug_assert_eq!(self.peek(), Some(b'['));
self.bump();
self.skip_ws();
let min = self.parse_numeric_bound()?;
self.skip_ws();
let max = self.parse_numeric_bound()?;
self.skip_ws();
if self.peek() != Some(b']') {
let preview = self.read_word_preview();
if !preview.is_empty() {
return Err(FtError::Unsupported(format!(
"FT.SEARCH filter: extra token '{}' in numeric range (geo filters not supported)",
String::from_utf8_lossy(preview),
)));
}
return Err(FtError::Syntax(
"FT.SEARCH filter: missing ']' in numeric range".to_string(),
));
}
self.bump();
Ok(FilterExpr::NumericRange { field, min, max })
}
fn parse_numeric_bound(&mut self) -> Result<NumericBound, FtError> {
self.skip_ws();
let mut exclusive = false;
if self.peek() == Some(b'(') {
self.bump();
exclusive = true;
}
let start = self.pos;
while let Some(b) = self.peek() {
if b.is_ascii_whitespace() || b == b']' {
break;
}
self.bump();
}
let token_bytes = &self.input[start..self.pos];
if token_bytes.is_empty() {
return Err(FtError::Syntax(
"FT.SEARCH filter: empty numeric bound".to_string(),
));
}
let raw = std::str::from_utf8(token_bytes)
.map_err(|_| FtError::Syntax("FT.SEARCH filter: bound is not UTF-8".to_string()))?;
let lower = raw.to_ascii_lowercase();
match lower.as_str() {
"+inf" | "inf" => {
if exclusive {
return Err(FtError::Syntax(
"FT.SEARCH filter: cannot apply '(' to infinity".to_string(),
));
}
Ok(NumericBound::PosInf)
}
"-inf" => {
if exclusive {
return Err(FtError::Syntax(
"FT.SEARCH filter: cannot apply '(' to infinity".to_string(),
));
}
Ok(NumericBound::NegInf)
}
_ => {
let parsed: f64 = raw.parse().map_err(|_| {
FtError::Syntax(format!("FT.SEARCH filter: invalid numeric bound {raw}"))
})?;
if !parsed.is_finite() {
return Err(FtError::Syntax(format!(
"FT.SEARCH filter: non-finite numeric bound {raw}",
)));
}
if exclusive {
Ok(NumericBound::Exclusive(parsed))
} else {
Ok(NumericBound::Inclusive(parsed))
}
}
}
}
fn parse_tag_set(&mut self, field: String) -> Result<FilterExpr, FtError> {
debug_assert_eq!(self.peek(), Some(b'{'));
self.bump();
let mut tags: Vec<Vec<u8>> = Vec::new();
loop {
self.skip_ws();
let start = self.pos;
while let Some(b) = self.peek() {
if b == b'|' || b == b'}' || b.is_ascii_whitespace() {
break;
}
self.bump();
}
let bytes = &self.input[start..self.pos];
if bytes.is_empty() {
return Err(FtError::Syntax(format!(
"FT.SEARCH filter: empty tag in @{field}:{{...}}"
)));
}
tags.push(bytes.to_vec());
self.skip_ws();
match self.peek() {
Some(b'|') => {
self.bump();
}
Some(b'}') => {
self.bump();
break;
}
_ => {
return Err(FtError::Syntax(format!(
"FT.SEARCH filter: expected '|' or '}}' in @{field}:{{...}}"
)));
}
}
}
if tags.is_empty() {
return Err(FtError::Syntax(format!(
"FT.SEARCH filter: @{field}:{{}} requires at least one tag",
)));
}
Ok(FilterExpr::TagSet { field, tags })
}
fn parse_text_token(&mut self, field: String) -> Result<FilterExpr, FtError> {
let start = self.pos;
while let Some(b) = self.peek() {
if b.is_ascii_whitespace()
|| b == b'|'
|| b == b'('
|| b == b')'
|| b == b'{'
|| b == b'}'
|| b == b'['
|| b == b']'
{
break;
}
self.bump();
}
let bytes = &self.input[start..self.pos];
if bytes.is_empty() {
return Err(FtError::Syntax(format!(
"FT.SEARCH filter: empty value for field {field}",
)));
}
Ok(FilterExpr::TextSubstring {
field,
query: bytes.to_vec(),
})
}
fn read_word_preview(&self) -> &[u8] {
let start = self.pos;
let mut end = start;
while end < self.input.len() && !self.input[end].is_ascii_whitespace() {
end += 1;
}
&self.input[start..end.min(start + 32)]
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn parse_match_all_returns_all_variant() {
let expr = parse_expr(b"*").unwrap();
assert_eq!(expr, FilterExpr::All);
}
#[test]
fn parse_numeric_range_inclusive() {
let expr = parse_expr(b"@score:[100 200]").unwrap();
match expr {
FilterExpr::NumericRange { field, min, max } => {
assert_eq!(field, "score");
assert_eq!(min, NumericBound::Inclusive(100.0));
assert_eq!(max, NumericBound::Inclusive(200.0));
}
other => panic!("unexpected {other:?}"),
}
}
#[test]
fn parse_numeric_range_exclusive_upper() {
let expr = parse_expr(b"@score:[100 (200]").unwrap();
let FilterExpr::NumericRange { min, max, .. } = expr else {
panic!("expected numeric range");
};
assert_eq!(min, NumericBound::Inclusive(100.0));
assert_eq!(max, NumericBound::Exclusive(200.0));
}
#[test]
fn parse_numeric_range_inf_bounds() {
let expr = parse_expr(b"@score:[-inf +inf]").unwrap();
let FilterExpr::NumericRange { min, max, .. } = expr else {
panic!("expected numeric range");
};
assert_eq!(min, NumericBound::NegInf);
assert_eq!(max, NumericBound::PosInf);
}
#[test]
fn parse_text_substring_token() {
let expr = parse_expr(b"@body:errno").unwrap();
match expr {
FilterExpr::TextSubstring { field, query } => {
assert_eq!(field, "body");
assert_eq!(query, b"errno");
}
other => panic!("unexpected {other:?}"),
}
}
#[test]
fn parse_tag_set_multi_value() {
let expr = parse_expr(b"@status:{ok|warn|error}").unwrap();
let FilterExpr::TagSet { field, tags } = expr else {
panic!("expected tag set");
};
assert_eq!(field, "status");
assert_eq!(
tags,
vec![b"ok".to_vec(), b"warn".to_vec(), b"error".to_vec()]
);
}
#[test]
fn parse_and_two_clauses() {
let expr = parse_expr(b"@score:[100 200] @body:errno").unwrap();
let FilterExpr::And(children) = expr else {
panic!("expected AND");
};
assert_eq!(children.len(), 2);
assert!(matches!(&children[0], FilterExpr::NumericRange { .. }));
assert!(matches!(&children[1], FilterExpr::TextSubstring { .. }));
}
#[test]
fn parse_or_two_clauses() {
let expr = parse_expr(b"@status:{ok} | @status:{warn}").unwrap();
let FilterExpr::Or(children) = expr else {
panic!("expected OR");
};
assert_eq!(children.len(), 2);
}
#[test]
fn parse_negation_unary() {
let expr = parse_expr(b"-@status:{stale}").unwrap();
let FilterExpr::Not(inner) = expr else {
panic!("expected NOT");
};
assert!(matches!(*inner, FilterExpr::TagSet { .. }));
}
#[test]
fn parse_grouping_with_parens() {
let expr = parse_expr(b"(@status:{a} | @status:{b}) @body:foo").unwrap();
let FilterExpr::And(children) = expr else {
panic!("expected AND at the top");
};
assert_eq!(children.len(), 2);
assert!(matches!(&children[0], FilterExpr::Or(_)));
assert!(matches!(&children[1], FilterExpr::TextSubstring { .. }));
}
#[test]
fn parse_empty_value_errors() {
let err = parse_expr(b"@body:").unwrap_err();
assert!(matches!(err, FtError::Syntax(_)));
}
#[test]
fn parse_unbalanced_paren_errors() {
let err = parse_expr(b"(@a:b").unwrap_err();
assert!(matches!(err, FtError::Syntax(_)));
}
#[test]
fn parse_geo_style_extra_token_returns_unsupported() {
let err = parse_expr(b"@loc:[10 20 5 km]").unwrap_err();
assert!(matches!(err, FtError::Unsupported(_)));
}
#[test]
fn parse_quoted_phrase_returns_unsupported() {
let err = parse_expr(b"@body:\"hello world\"").unwrap_err();
assert!(matches!(err, FtError::Unsupported(_)));
}
#[test]
fn numeric_bound_satisfies_inclusive() {
let b = NumericBound::Inclusive(10.0);
assert!(b.satisfies(10.0, true));
assert!(b.satisfies(11.0, true));
assert!(!b.satisfies(9.0, true));
assert!(b.satisfies(9.0, false));
assert!(!b.satisfies(11.0, false));
}
#[test]
fn numeric_bound_satisfies_exclusive() {
let b = NumericBound::Exclusive(10.0);
assert!(!b.satisfies(10.0, true));
assert!(b.satisfies(11.0, true));
assert!(!b.satisfies(10.0, false));
}
#[test]
fn numeric_bound_satisfies_inf() {
assert!(NumericBound::NegInf.satisfies(-1e300, true));
assert!(NumericBound::PosInf.satisfies(1e300, false));
assert!(!NumericBound::NegInf.satisfies(0.0, false));
assert!(!NumericBound::PosInf.satisfies(0.0, true));
}
#[test]
fn any_tag_present_is_separator_aware() {
assert!(any_tag_present(b"ok,warn", b',', &[b"warn".to_vec()]));
assert!(!any_tag_present(b"ok,warn", b',', &[b"info".to_vec()]));
assert!(any_tag_present(
b"ok|warn",
b'|',
&[b"warn".to_vec(), b"foo".to_vec()]
));
assert!(any_tag_present(b" ok , warn", b',', &[b"ok".to_vec()]));
}
#[test]
fn contains_subseq_is_byte_substring() {
assert!(contains_subseq(b"hello world", b"lo wo"));
assert!(!contains_subseq(b"hello world", b"xyz"));
assert!(contains_subseq(b"abc", b""));
assert!(!contains_subseq(b"a", b"abc"));
}
}